diff --git a/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c index 877d83c0fa395..caa803ee79460 100644 --- a/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c +++ b/clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \ -// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,instcombine | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target @@ -8,10 +8,7 @@ // CHECK-LABEL: @test_vbfdot_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[B]]) +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]]) // CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] // float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { @@ -20,10 +17,7 @@ float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) { // CHECK-LABEL: @test_vbfdotq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) // CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){ @@ -32,19 +26,10 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){ // CHECK-LABEL: @test_vbfdot_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_128:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-NEXT: [[__REINT1_128:%.*]] = alloca <2 x float>, align 8 -// CHECK-NEXT: store <4 x bfloat> [[B:%.*]], ptr [[__REINT_128]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__REINT_128]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer -// CHECK-NEXT: store <2 x float> [[LANE]], ptr [[__REINT1_128]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[__REINT1_128]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <8 x i8> -// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP3]]) +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 
x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) // CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] // float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){ @@ -53,19 +38,10 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){ // CHECK-LABEL: @test_vbfdotq_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_130:%.*]] = alloca <8 x bfloat>, align 16 -// CHECK-NEXT: [[__REINT1_130:%.*]] = alloca <4 x float>, align 16 -// CHECK-NEXT: store <8 x bfloat> [[B:%.*]], ptr [[__REINT_130]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__REINT_130]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK-NEXT: store <4 x float> [[LANE]], ptr [[__REINT1_130]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load <8 x bfloat>, ptr [[__REINT1_130]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[TMP3]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP3]]) +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) // CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { @@ -74,19 +50,10 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b // CHECK-LABEL: @test_vbfdot_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_132:%.*]] = alloca <8 x bfloat>, align 16 -// CHECK-NEXT: [[__REINT1_132:%.*]] = alloca <2 x float>, align 8 -// CHECK-NEXT: store <8 x bfloat> [[B:%.*]], ptr [[__REINT_132]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__REINT_132]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <2 x i32> <i32 3, i32 3> -// CHECK-NEXT: store <2 x float> [[LANE]], ptr [[__REINT1_132]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[__REINT1_132]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <8 x i8> -// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP3]]) +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast 
<8 x bfloat> [[B:%.*]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> poison, <2 x i32> <i32 3, i32 3> +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]]) // CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]] // float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) { @@ -95,19 +62,10 @@ float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) // CHECK-LABEL: @test_vbfdotq_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_126:%.*]] = alloca <4 x bfloat>, align 8 -// CHECK-NEXT: [[__REINT1_126:%.*]] = alloca <4 x float>, align 16 -// CHECK-NEXT: store <4 x bfloat> [[B:%.*]], ptr [[__REINT_126]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__REINT_126]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> zeroinitializer -// CHECK-NEXT: store <4 x float> [[LANE]], ptr [[__REINT1_126]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load <8 x bfloat>, ptr [[__REINT1_126]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[TMP3]] to <16 x i8> -// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP3]]) +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat> +// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]]) // CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]] // float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { @@ -116,11 +74,7 @@ float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) // CHECK-LABEL: @test_vbfmmlaq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMMLAQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) -// CHECK-NEXT: [[VBFMMLAQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMMLAQ_F323_I]] to <16 x i8> +// CHECK-NEXT: [[VBFMMLAQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) // CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_F323_I]] // float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { @@ -129,11 +83,7 @@ float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { // CHECK-LABEL: @test_vbfmlalbq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// 
CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) -// CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8> +// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) // CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]] // float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { @@ -142,11 +92,7 @@ float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { // CHECK-LABEL: @test_vbfmlaltq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) -// CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8> +// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) // CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]] // float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { @@ -155,27 +101,8 @@ float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { // CHECK-LABEL: @test_vbfmlalbq_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[B:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0 -// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2 -// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4 -// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6 -// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) 
-// CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8> +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) // CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]] // float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { @@ -184,27 +111,8 @@ float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t // CHECK-LABEL: @test_vbfmlalbq_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 3 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGETQ_LANE]], i32 0 -// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGETQ_LANE3]], i32 1 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGETQ_LANE8]], i32 2 -// CHECK-NEXT: [[VGETQ_LANE13:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGETQ_LANE13]], i32 3 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGETQ_LANE18]], i32 4 -// CHECK-NEXT: [[VGETQ_LANE23:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGETQ_LANE23]], i32 5 -// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGETQ_LANE28]], i32 6 -// CHECK-NEXT: [[VGETQ_LANE33:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGETQ_LANE33]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) -// CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8> +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) // CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]] // float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { @@ -213,27 +121,8 @@ float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t // CHECK-LABEL: @test_vbfmlaltq_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[B:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0 -// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> 
[[VECINIT]], bfloat [[VGET_LANE3]], i32 1 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2 -// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4 -// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6 -// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) -// CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8> +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) // CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]] // float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { @@ -242,27 +131,8 @@ float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t // CHECK-LABEL: @test_vbfmlaltq_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 3 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGETQ_LANE]], i32 0 -// CHECK-NEXT: [[VGETQ_LANE3:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGETQ_LANE3]], i32 1 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGETQ_LANE8]], i32 2 -// CHECK-NEXT: [[VGETQ_LANE13:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGETQ_LANE13]], i32 3 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGETQ_LANE18]], i32 4 -// CHECK-NEXT: [[VGETQ_LANE23:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGETQ_LANE23]], i32 5 -// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGETQ_LANE28]], i32 6 -// CHECK-NEXT: 
[[VGETQ_LANE33:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGETQ_LANE33]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) -// CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8> +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) // CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]] // float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { diff --git a/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c index 9da2cd5af3221..5455850e6d0f0 100644 --- a/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c +++ b/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \ -// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,instcombine | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target @@ -17,10 +17,8 @@ bfloat16x4_t test_vcreate_bf16(uint64_t a) { // CHECK-LABEL: @test_vdup_n_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x bfloat> poison, bfloat [[V:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x bfloat> [[VECINIT_I]], bfloat [[V]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x bfloat> [[VECINIT1_I]], bfloat [[V]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x bfloat> [[VECINIT2_I]], bfloat [[V]], i32 3 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x bfloat> poison, bfloat [[V:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x bfloat> [[VECINIT_I]], <4 x bfloat> poison, <4 x i32> zeroinitializer // CHECK-NEXT: ret <4 x bfloat> [[VECINIT3_I]] // bfloat16x4_t test_vdup_n_bf16(bfloat16_t v) { @@ -29,14 +27,8 @@ bfloat16x4_t test_vdup_n_bf16(bfloat16_t v) { // CHECK-LABEL: @test_vdupq_n_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x bfloat> poison, bfloat [[V:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x bfloat> [[VECINIT_I]], bfloat [[V]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x bfloat> [[VECINIT1_I]], bfloat [[V]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x bfloat> [[VECINIT2_I]], bfloat [[V]], i32 3 -// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x bfloat> [[VECINIT3_I]], bfloat [[V]], i32 4 -// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x bfloat> [[VECINIT4_I]], bfloat [[V]], i32 5 -// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x bfloat> [[VECINIT5_I]], bfloat [[V]], i32 6 -// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x bfloat> [[VECINIT6_I]], bfloat [[V]], i32 7 +// CHECK-NEXT: [[VECINIT_I:%.*]] = 
insertelement <8 x bfloat> poison, bfloat [[V:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x bfloat> [[VECINIT_I]], <8 x bfloat> poison, <8 x i32> zeroinitializer // CHECK-NEXT: ret <8 x bfloat> [[VECINIT7_I]] // bfloat16x8_t test_vdupq_n_bf16(bfloat16_t v) { @@ -45,9 +37,7 @@ bfloat16x8_t test_vdupq_n_bf16(bfloat16_t v) { // CHECK-LABEL: @test_vdup_lane_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x bfloat> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[V:%.*]], <4 x bfloat> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1> // CHECK-NEXT: ret <4 x bfloat> [[LANE]] // bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) { @@ -56,9 +46,7 @@ bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) { // CHECK-LABEL: @test_vdupq_lane_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x bfloat> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP1]], <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[V:%.*]], <4 x bfloat> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> // CHECK-NEXT: ret <8 x bfloat> [[LANE]] // bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) { @@ -67,9 +55,7 @@ bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) { // CHECK-LABEL: @test_vdup_laneq_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x bfloat> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[V:%.*]], <8 x bfloat> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7> // CHECK-NEXT: ret <4 x bfloat> [[LANE]] // bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) { @@ -78,9 +64,7 @@ bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) { // CHECK-LABEL: @test_vdupq_laneq_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x bfloat> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[V:%.*]], <8 x bfloat> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> // CHECK-NEXT: ret <8 x bfloat> [[LANE]] // bfloat16x8_t test_vdupq_laneq_bf16(bfloat16x8_t v) { @@ -98,7 +82,7 @@ bfloat16x8_t test_vcombine_bf16(bfloat16x4_t low, bfloat16x4_t high) { // CHECK-LABEL: @test_vget_high_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> // CHECK-NEXT: ret <4 x bfloat> [[SHUFFLE_I]] // bfloat16x4_t test_vget_high_bf16(bfloat16x8_t a) { @@ -107,7 +91,7 @@ bfloat16x4_t test_vget_high_bf16(bfloat16x8_t a) { // CHECK-LABEL: @test_vget_low_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> // CHECK-NEXT: ret <4 x bfloat> [[SHUFFLE_I]] // bfloat16x4_t test_vget_low_bf16(bfloat16x8_t a) { @@ -116,7 +100,7 @@ bfloat16x4_t test_vget_low_bf16(bfloat16x8_t a) { // 
CHECK-LABEL: @test_vget_lane_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i32 1 +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i64 1 // CHECK-NEXT: ret bfloat [[VGET_LANE]] // bfloat16_t test_vget_lane_bf16(bfloat16x4_t v) { @@ -125,7 +109,7 @@ bfloat16_t test_vget_lane_bf16(bfloat16x4_t v) { // CHECK-LABEL: @test_vgetq_lane_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i32 7 +// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i64 7 // CHECK-NEXT: ret bfloat [[VGETQ_LANE]] // bfloat16_t test_vgetq_lane_bf16(bfloat16x8_t v) { @@ -134,7 +118,7 @@ bfloat16_t test_vgetq_lane_bf16(bfloat16x8_t v) { // CHECK-LABEL: @test_vset_lane_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i32 1 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i64 1 // CHECK-NEXT: ret <4 x bfloat> [[VSET_LANE]] // bfloat16x4_t test_vset_lane_bf16(bfloat16_t a, bfloat16x4_t v) { @@ -143,7 +127,7 @@ bfloat16x4_t test_vset_lane_bf16(bfloat16_t a, bfloat16x4_t v) { // CHECK-LABEL: @test_vsetq_lane_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i32 7 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i64 7 // CHECK-NEXT: ret <8 x bfloat> [[VSET_LANE]] // bfloat16x8_t test_vsetq_lane_bf16(bfloat16_t a, bfloat16x8_t v) { @@ -152,7 +136,7 @@ bfloat16x8_t test_vsetq_lane_bf16(bfloat16_t a, bfloat16x8_t v) { // CHECK-LABEL: @test_vduph_lane_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i32 1 +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i64 1 // CHECK-NEXT: ret bfloat [[VGET_LANE]] // bfloat16_t test_vduph_lane_bf16(bfloat16x4_t v) { @@ -161,7 +145,7 @@ bfloat16_t test_vduph_lane_bf16(bfloat16x4_t v) { // CHECK-LABEL: @test_vduph_laneq_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i32 7 +// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i64 7 // CHECK-NEXT: ret bfloat [[VGETQ_LANE]] // bfloat16_t test_vduph_laneq_bf16(bfloat16x8_t v) { diff --git a/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c index 2b271ac88462b..62d2ca48f3ffe 100644 --- a/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c +++ b/clang/test/CodeGen/AArch64/bf16-reinterpret-intrinsics.c @@ -1,333 +1,388 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \ // RUN: -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg \ +// RUN: | opt -S -passes=mem2reg,instcombine \ // RUN: | FileCheck %s // REQUIRES: aarch64-registered-target #include <arm_neon.h> -// CHECK-LABEL: @test_vreinterpret_bf16_s8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t 
test_vreinterpret_bf16_s8(int8x8_t a) { return vreinterpret_bf16_s8(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_s16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_s16(int16x4_t a) { return vreinterpret_bf16_s16(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_s32(int32x2_t a) { return vreinterpret_bf16_s32(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_f32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_f32(float32x2_t a) { return vreinterpret_bf16_f32(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_u8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_u8(uint8x8_t a) { return vreinterpret_bf16_u8(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_u16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_u16(uint16x4_t a) { return vreinterpret_bf16_u16(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_u32(uint32x2_t a) { return vreinterpret_bf16_u32(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_p8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x 
bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_p8(poly8x8_t a) { return vreinterpret_bf16_p8(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_p16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_p16(poly16x4_t a) { return vreinterpret_bf16_p16(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_u64(uint64x1_t a) { return vreinterpret_bf16_u64(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_s64(int64x1_t a) { return vreinterpret_bf16_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_s8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_s8(int8x16_t a) { return vreinterpretq_bf16_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_s16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_s16(int16x8_t a) { return vreinterpretq_bf16_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_s32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_s32(int32x4_t a) { return vreinterpretq_bf16_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_f32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x 
float> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_f32(float32x4_t a) { return vreinterpretq_bf16_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_u8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_u8(uint8x16_t a) { return vreinterpretq_bf16_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_u16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_u16(uint16x8_t a) { return vreinterpretq_bf16_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_u32( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_u32(uint32x4_t a) { return vreinterpretq_bf16_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_p8( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_p8(poly8x16_t a) { return vreinterpretq_bf16_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_p16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_p16(poly16x8_t a) { return vreinterpretq_bf16_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_u64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_u64(uint64x2_t a) { return vreinterpretq_bf16_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_s64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_s64(int64x2_t a) { return vreinterpretq_bf16_s64(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_p64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_p64(poly64x1_t a) { return vreinterpret_bf16_p64(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_p64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_p64(poly64x2_t a) { return vreinterpretq_bf16_p64(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_p128( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_p128( +// CHECK-SAME: i128 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_p128(poly128_t a) { return vreinterpretq_bf16_p128(a); } -// CHECK-LABEL: @test_vreinterpret_bf16_f64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <4 x bfloat> +// CHECK-LABEL: define dso_local <4 x bfloat> @test_vreinterpret_bf16_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <4 x bfloat> // CHECK-NEXT: ret <4 x bfloat> [[TMP0]] // bfloat16x4_t test_vreinterpret_bf16_f64(float64x1_t a) { return vreinterpret_bf16_f64(a); } -// CHECK-LABEL: @test_vreinterpretq_bf16_f64( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <8 x bfloat> +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vreinterpretq_bf16_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <8 x bfloat> // CHECK-NEXT: ret <8 x bfloat> [[TMP0]] // bfloat16x8_t test_vreinterpretq_bf16_f64(float64x2_t a) { return vreinterpretq_bf16_f64(a); } -// CHECK-LABEL: @test_vreinterpret_s8_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <8 x i8> // CHECK-NEXT: ret <8 x i8> [[TMP0]] // int8x8_t test_vreinterpret_s8_bf16(bfloat16x4_t a) { return vreinterpret_s8_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_s16_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16> +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_bf16( +// 
CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16> // CHECK-NEXT: ret <4 x i16> [[TMP0]] // int16x4_t test_vreinterpret_s16_bf16(bfloat16x4_t a) { return vreinterpret_s16_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_s32_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <2 x i32> +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <2 x i32> // CHECK-NEXT: ret <2 x i32> [[TMP0]] // int32x2_t test_vreinterpret_s32_bf16(bfloat16x4_t a) { return vreinterpret_s32_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_f32_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <2 x float> +// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <2 x float> // CHECK-NEXT: ret <2 x float> [[TMP0]] // float32x2_t test_vreinterpret_f32_bf16(bfloat16x4_t a) { return vreinterpret_f32_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_u8_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <8 x i8> // CHECK-NEXT: ret <8 x i8> [[TMP0]] // uint8x8_t test_vreinterpret_u8_bf16(bfloat16x4_t a) { return vreinterpret_u8_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_u16_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16> +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16> // CHECK-NEXT: ret <4 x i16> [[TMP0]] // uint16x4_t test_vreinterpret_u16_bf16(bfloat16x4_t a) { return vreinterpret_u16_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_u32_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <2 x i32> +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <2 x i32> // CHECK-NEXT: ret <2 x i32> [[TMP0]] // uint32x2_t test_vreinterpret_u32_bf16(bfloat16x4_t a) { return vreinterpret_u32_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_p8_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8> +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <8 x i8> // CHECK-NEXT: ret <8 x i8> [[TMP0]] // poly8x8_t test_vreinterpret_p8_bf16(bfloat16x4_t a) { return vreinterpret_p8_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_p16_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16> +// CHECK-LABEL: define dso_local <4 x i16> 
@test_vreinterpret_p16_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <4 x i16> // CHECK-NEXT: ret <4 x i16> [[TMP0]] // poly16x4_t test_vreinterpret_p16_bf16(bfloat16x4_t a) { return vreinterpret_p16_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_u64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <1 x i64> +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <1 x i64> // CHECK-NEXT: ret <1 x i64> [[TMP0]] // uint64x1_t test_vreinterpret_u64_bf16(bfloat16x4_t a) { return vreinterpret_u64_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_s64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <1 x i64> +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <1 x i64> // CHECK-NEXT: ret <1 x i64> [[TMP0]] // int64x1_t test_vreinterpret_s64_bf16(bfloat16x4_t a) { return vreinterpret_s64_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_p64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <1 x i64> +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <1 x i64> // CHECK-NEXT: ret <1 x i64> [[TMP0]] // poly64x1_t test_vreinterpret_p64_bf16(bfloat16x4_t a) { return vreinterpret_p64_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_s8_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <16 x i8> // CHECK-NEXT: ret <16 x i8> [[TMP0]] // int8x16_t test_vreinterpretq_s8_bf16(bfloat16x8_t a) { return vreinterpretq_s8_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_s16_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16> // CHECK-NEXT: ret <8 x i16> [[TMP0]] // int16x8_t test_vreinterpretq_s16_bf16(bfloat16x8_t a) { return vreinterpretq_s16_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <4 x i32> +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] // int32x4_t test_vreinterpretq_s32_bf16(bfloat16x8_t a) { return vreinterpretq_s32_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_f32_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <4 x float> +// 
CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <4 x float> // CHECK-NEXT: ret <4 x float> [[TMP0]] // float32x4_t test_vreinterpretq_f32_bf16(bfloat16x8_t a) { return vreinterpretq_f32_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <16 x i8> // CHECK-NEXT: ret <16 x i8> [[TMP0]] // uint8x16_t test_vreinterpretq_u8_bf16(bfloat16x8_t a) { return vreinterpretq_u8_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16> // CHECK-NEXT: ret <8 x i16> [[TMP0]] // uint16x8_t test_vreinterpretq_u16_bf16(bfloat16x8_t a) { return vreinterpretq_u16_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_u32_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <4 x i32> +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <4 x i32> // CHECK-NEXT: ret <4 x i32> [[TMP0]] // uint32x4_t test_vreinterpretq_u32_bf16(bfloat16x8_t a) { return vreinterpretq_u32_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_p8_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <16 x i8> // CHECK-NEXT: ret <16 x i8> [[TMP0]] // poly8x16_t test_vreinterpretq_p8_bf16(bfloat16x8_t a) { return vreinterpretq_p8_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_p16_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <8 x i16> +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <8 x i16> // CHECK-NEXT: ret <8 x i16> [[TMP0]] // poly16x8_t test_vreinterpretq_p16_bf16(bfloat16x8_t a) { return vreinterpretq_p16_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_u64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <2 x i64> +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x i64> // CHECK-NEXT: ret <2 x i64> [[TMP0]] // uint64x2_t test_vreinterpretq_u64_bf16(bfloat16x8_t a) { return vreinterpretq_u64_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] 
= bitcast <8 x bfloat> [[A:%.*]] to <2 x i64> +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x i64> // CHECK-NEXT: ret <2 x i64> [[TMP0]] // int64x2_t test_vreinterpretq_s64_bf16(bfloat16x8_t a) { return vreinterpretq_s64_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_p64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <2 x i64> +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x i64> // CHECK-NEXT: ret <2 x i64> [[TMP0]] // poly64x2_t test_vreinterpretq_p64_bf16(bfloat16x8_t a) { return vreinterpretq_p64_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_p128_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to i128 +// CHECK-LABEL: define dso_local i128 @test_vreinterpretq_p128_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to i128 // CHECK-NEXT: ret i128 [[TMP0]] // poly128_t test_vreinterpretq_p128_bf16(bfloat16x8_t a) { return vreinterpretq_p128_bf16(a); } -// CHECK-LABEL: @test_vreinterpret_f64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <1 x double> +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_bf16( +// CHECK-SAME: <4 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[A]] to <1 x double> // CHECK-NEXT: ret <1 x double> [[TMP0]] // float64x1_t test_vreinterpret_f64_bf16(bfloat16x4_t a) { return vreinterpret_f64_bf16(a); } -// CHECK-LABEL: @test_vreinterpretq_f64_bf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <2 x double> +// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_bf16( +// CHECK-SAME: <8 x bfloat> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[A]] to <2 x double> // CHECK-NEXT: ret <2 x double> [[TMP0]] // float64x2_t test_vreinterpretq_f64_bf16(bfloat16x8_t a) { return vreinterpretq_f64_bf16(a); } diff --git a/clang/test/CodeGen/AArch64/neon-2velem.c b/clang/test/CodeGen/AArch64/neon-2velem.c index 75bdeb92fd9ca..1d28b48f29bf7 100644 --- a/clang/test/CodeGen/AArch64/neon-2velem.c +++ b/clang/test/CodeGen/AArch64/neon-2velem.c @@ -1,5 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target @@ -7,9 +7,7 @@ // CHECK-LABEL: @test_vmla_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> 
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i16> [[ADD]]
@@ -20,9 +18,7 @@ int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {

 // CHECK-LABEL: @test_vmlaq_lane_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <8 x i16> [[ADD]]
@@ -33,9 +29,7 @@ int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {

 // CHECK-LABEL: @test_vmla_lane_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <2 x i32> [[ADD]]
@@ -46,9 +40,7 @@ int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {

 // CHECK-LABEL: @test_vmlaq_lane_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -59,9 +51,7 @@ int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {

 // CHECK-LABEL: @test_vmla_laneq_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i16> [[ADD]]
@@ -72,9 +62,7 @@ int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {

 // CHECK-LABEL: @test_vmlaq_laneq_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <8 x i16> [[ADD]]
@@ -85,9 +73,7 @@ int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {

 // CHECK-LABEL: @test_vmla_laneq_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <2 x i32> [[ADD]]
@@ -98,9 +84,7 @@ int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {

 // CHECK-LABEL: @test_vmlaq_laneq_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -111,9 +95,7 @@ int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {

 // CHECK-LABEL: @test_vmls_lane_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i16> [[SUB]]
@@ -124,9 +106,7 @@ int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {

 // CHECK-LABEL: @test_vmlsq_lane_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <8 x i16> [[SUB]]
@@ -137,9 +117,7 @@ int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {

 // CHECK-LABEL: @test_vmls_lane_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <2 x i32> [[SUB]]
@@ -150,9 +128,7 @@ int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {

 // CHECK-LABEL: @test_vmlsq_lane_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -163,9 +139,7 @@ int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {

 // CHECK-LABEL: @test_vmls_laneq_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i16> [[SUB]]
@@ -176,9 +150,7 @@ int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {

 // CHECK-LABEL: @test_vmlsq_laneq_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <8 x i16> [[SUB]]
@@ -189,9 +161,7 @@ int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {

 // CHECK-LABEL: @test_vmls_laneq_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <2 x i32> [[SUB]]
@@ -202,9 +172,7 @@ int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {

 // CHECK-LABEL: @test_vmlsq_laneq_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -215,9 +183,7 @@ int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {

 // CHECK-LABEL: @test_vmul_lane_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x i16> [[MUL]]
 //
@@ -227,9 +193,7 @@ int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {

 // CHECK-LABEL: @test_vmulq_lane_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <8 x i16> [[MUL]]
 //
@@ -239,9 +203,7 @@ int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {

 // CHECK-LABEL: @test_vmul_lane_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <2 x i32> [[MUL]]
 //
@@ -251,9 +213,7 @@ int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {

 // CHECK-LABEL: @test_vmulq_lane_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x i32> [[MUL]]
 //
@@ -263,9 +223,7 @@ int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {

 // CHECK-LABEL: @test_vmul_lane_u16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x i16> [[MUL]]
 //
@@ -275,9 +233,7 @@ uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {

 // CHECK-LABEL: @test_vmulq_lane_u16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <8 x i16> [[MUL]]
 //
@@ -287,9 +243,7 @@ uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {

 // CHECK-LABEL: @test_vmul_lane_u32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <2 x i32> [[MUL]]
 //
@@ -299,9 +253,7 @@ uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {

 // CHECK-LABEL: @test_vmulq_lane_u32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x i32> [[MUL]]
 //
@@ -311,9 +263,7 @@ uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {

 // CHECK-LABEL: @test_vmul_laneq_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x i16> [[MUL]]
 //
@@ -323,9 +273,7 @@ int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {

 // CHECK-LABEL: @test_vmulq_laneq_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <8 x i16> [[MUL]]
 //
@@ -335,9 +283,7 @@ int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {

 // CHECK-LABEL: @test_vmul_laneq_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <2 x i32> [[MUL]]
 //
@@ -347,9 +293,7 @@ int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {

 // CHECK-LABEL: @test_vmulq_laneq_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x i32> [[MUL]]
 //
@@ -359,9 +303,7 @@ int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {

 // CHECK-LABEL: @test_vmul_laneq_u16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x i16> [[MUL]]
 //
@@ -371,9 +313,7 @@ uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {

 // CHECK-LABEL: @test_vmulq_laneq_u16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <8 x i16> [[MUL]]
 //
@@ -383,9 +323,7 @@ uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {

 // CHECK-LABEL: @test_vmul_laneq_u32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <2 x i32> [[MUL]]
 //
@@ -395,9 +333,7 @@ uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {

 // CHECK-LABEL: @test_vmulq_laneq_u32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32>
 // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
 // CHECK-NEXT: ret <4 x i32> [[MUL]]
 //
@@ -407,14 +343,8 @@ uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {

 // CHECK-LABEL: @test_vfma_lane_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B:%.*]], <2 x float> [[LANE]], <2 x float> [[A:%.*]])
 // CHECK-NEXT: ret <2 x float> [[FMLA2]]
 //
 float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
@@ -423,14 +353,8 @@ float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {

 // CHECK-LABEL: @test_vfmaq_lane_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <4 x i32>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[LANE]], <4 x float> [[A:%.*]])
 // CHECK-NEXT: ret <4 x float> [[FMLA2]]
 //
 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
@@ -439,15 +363,9 @@ float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {

 // CHECK-LABEL: @test_vfma_laneq_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32>
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
-// CHECK-NEXT: ret <2 x float> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[B:%.*]], <2 x float> [[A:%.*]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
 //
 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
   return vfma_laneq_f32(a, b, v, 3);
@@ -455,15 +373,9 @@ float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {

 // CHECK-LABEL: @test_vfmaq_laneq_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32>
-// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
-// CHECK-NEXT: ret <4 x float> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
 //
 float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
   return vfmaq_laneq_f32(a, b, v, 3);
@@ -472,14 +384,8 @@ float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
 // CHECK-LABEL: @test_vfms_lane_f32(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG]], <2 x float> [[LANE]], <2 x float> [[A:%.*]])
 // CHECK-NEXT: ret <2 x float> [[FMLA2]]
 //
 float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
@@ -489,14 +395,8 @@ float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
 // CHECK-LABEL: @test_vfmsq_lane_f32(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <4 x i32>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG]], <4 x float> [[LANE]], <4 x float> [[A:%.*]])
 // CHECK-NEXT: ret <4 x float> [[FMLA2]]
 //
 float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
@@ -506,15 +406,9 @@ float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
 // CHECK-LABEL: @test_vfms_laneq_f32(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32>
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
-// CHECK-NEXT: ret <2 x float> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[FNEG]], <2 x float> [[A:%.*]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
 //
 float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
   return vfms_laneq_f32(a, b, v, 3);
@@ -523,15 +417,9 @@ float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
 // CHECK-LABEL: @test_vfmsq_laneq_f32(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32>
-// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
-// CHECK-NEXT: ret <4 x float> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[FNEG]], <4 x float> [[A:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
 //
 float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
   return vfmsq_laneq_f32(a, b, v, 3);
@@ -539,14 +427,8 @@ float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {

 // CHECK-LABEL: @test_vfmaq_lane_f64(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[B:%.*]], <2 x double> [[LANE]], <2 x double> [[A:%.*]])
 // CHECK-NEXT: ret <2 x double> [[FMLA2]]
 //
 float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
@@ -555,15 +437,9 @@ float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {

 // CHECK-LABEL: @test_vfmaq_laneq_f64(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32>
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
-// CHECK-NEXT: ret <2 x double> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[B:%.*]], <2 x double> [[A:%.*]])
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
 //
 float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
   return vfmaq_laneq_f64(a, b, v, 1);
@@ -572,14 +448,8 @@ float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
 // CHECK-LABEL: @test_vfmsq_lane_f64(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FNEG]], <2 x double> [[LANE]], <2 x double> [[A:%.*]])
 // CHECK-NEXT: ret <2 x double> [[FMLA2]]
 //
 float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
@@ -589,15 +459,9 @@ float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
 // CHECK-LABEL: @test_vfmsq_laneq_f64(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32>
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
-// CHECK-NEXT: ret <2 x double> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[FNEG]], <2 x double> [[A:%.*]])
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
 //
 float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
   return vfmsq_laneq_f64(a, b, v, 1);
@@ -605,7 +469,7 @@ float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {

 // CHECK-LABEL: @test_vfmas_laneq_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i64 3
 // CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B:%.*]], float [[EXTRACT]], float [[A:%.*]])
 // CHECK-NEXT: ret float [[TMP0]]
 //
@@ -616,7 +480,7 @@ float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
 // CHECK-LABEL: @test_vfmsd_lane_f64(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]]
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[V:%.*]], i32 0
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[V:%.*]], i64 0
 // CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[FNEG]], double [[EXTRACT]], double [[A:%.*]])
 // CHECK-NEXT: ret double [[TMP0]]
 //
@@ -627,7 +491,7 @@ float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
 // CHECK-LABEL: @test_vfmss_laneq_f32(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[FNEG:%.*]] = fneg float [[B:%.*]]
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i32 3
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x float> [[V:%.*]], i64 3
 // CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A:%.*]])
 // CHECK-NEXT: ret float [[TMP0]]
 //
@@ -638,7 +502,7 @@ float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
 // CHECK-LABEL: @test_vfmsd_laneq_f64(
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[FNEG:%.*]] = fneg double [[B:%.*]]
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V:%.*]], i32 1
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V:%.*]], i64 1
 // CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[FNEG]], double [[EXTRACT]], double [[A:%.*]])
 // CHECK-NEXT: ret double [[TMP0]]
 //
@@ -648,12 +512,8 @@ float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {

 // CHECK-LABEL: @test_vmlal_lane_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -663,12 +523,8 @@ int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {

 // CHECK-LABEL: @test_vmlal_lane_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
 //
@@ -678,12 +534,8 @@ int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {

 // CHECK-LABEL: @test_vmlal_laneq_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -693,12 +545,8 @@ int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {

 // CHECK-LABEL: @test_vmlal_laneq_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
 //
@@ -708,12 +556,8 @@ int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {

 // CHECK-LABEL: @test_vmlal_high_lane_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32>
 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -724,12 +568,8 @@ int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {

 // CHECK-LABEL: @test_vmlal_high_lane_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32>
 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -740,12 +580,8 @@ int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {

 // CHECK-LABEL: @test_vmlal_high_laneq_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -756,12 +592,8 @@ int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {

 // CHECK-LABEL: @test_vmlal_high_laneq_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -772,12 +604,8 @@ int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {

 // CHECK-LABEL: @test_vmlsl_lane_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -787,12 +615,8 @@ int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {

 // CHECK-LABEL: @test_vmlsl_lane_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -802,12 +626,8 @@ int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {

 // CHECK-LABEL: @test_vmlsl_laneq_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -817,12 +637,8 @@ int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {

 // CHECK-LABEL: @test_vmlsl_laneq_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -832,12 +648,8 @@ int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {

 // CHECK-LABEL: @test_vmlsl_high_lane_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32>
 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -848,12 +660,8 @@ int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {

 // CHECK-LABEL: @test_vmlsl_high_lane_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32>
 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
@@ -864,12 +672,8 @@ int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {

 // CHECK-LABEL: @test_vmlsl_high_laneq_s16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -880,12 +684,8 @@ int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {

 // CHECK-LABEL: @test_vmlsl_high_laneq_s32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
@@ -896,12 +696,8 @@ int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {

 // CHECK-LABEL: @test_vmlal_lane_u16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -911,12 +707,8 @@ int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {

 // CHECK-LABEL: @test_vmlal_lane_u32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
 //
@@ -926,12 +718,8 @@ int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {

 // CHECK-LABEL: @test_vmlal_laneq_u16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
 //
@@ -941,12 +729,8 @@ int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {

 // CHECK-LABEL: @test_vmlal_laneq_u32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
 //
@@ -956,12 +740,8 @@ int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {

 // CHECK-LABEL: @test_vmlal_high_lane_u16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32>
 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -972,12 +752,8 @@ int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {

 // CHECK-LABEL: @test_vmlal_high_lane_u32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32>
 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -988,12 +764,8 @@ int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {

 // CHECK-LABEL: @test_vmlal_high_laneq_u16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -1004,12 +776,8 @@ int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {

 // CHECK-LABEL: @test_vmlal_high_laneq_u32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -1020,12 +788,8 @@ int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {

 // CHECK-LABEL: @test_vmlsl_lane_u16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -1035,12 +799,8 @@ int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {

 // CHECK-LABEL: @test_vmlsl_lane_u32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -1050,12 +810,8 @@ int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {

 // CHECK-LABEL: @test_vmlsl_laneq_u16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
 //
@@ -1065,12 +821,8 @@ int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {

 // CHECK-LABEL: @test_vmlsl_laneq_u32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
 //
@@ -1080,12 +832,8 @@ int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {

 // CHECK-LABEL: @test_vmlsl_high_lane_u16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32>
 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -1096,12 +844,8 @@ int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {

 // CHECK-LABEL: @test_vmlsl_high_lane_u32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32>
 // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
 // CHECK-NEXT: ret <2 x i64> [[SUB]]
@@ -1112,12 +856,8 @@ int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {

 // CHECK-LABEL: @test_vmlsl_high_laneq_u16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
 // CHECK-NEXT:
[[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] @@ -1128,12 +868,8 @@ int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlsl_high_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] @@ -1144,12 +880,8 @@ int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmull_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) { @@ -1158,12 +890,8 @@ int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vmull_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) { @@ -1172,12 +900,8 @@ int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) { // CHECK-LABEL: @test_vmull_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], 
<4 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) { @@ -1186,12 +910,8 @@ uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) { // CHECK-LABEL: @test_vmull_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) { @@ -1200,12 +920,8 @@ uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) { // CHECK-LABEL: @test_vmull_high_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // @@ -1215,12 +931,8 @@ int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) { // CHECK-LABEL: @test_vmull_high_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // 
@@ -1230,12 +942,8 @@ int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vmull_high_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // @@ -1245,12 +953,8 @@ uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) { // CHECK-LABEL: @test_vmull_high_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // @@ -1260,12 +964,8 @@ uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) { // CHECK-LABEL: @test_vmull_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) { @@ -1274,12 +974,8 @@ int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) { // CHECK-LABEL: @test_vmull_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) { @@ -1288,12 +984,8 @@ int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) { // CHECK-LABEL: @test_vmull_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) { @@ -1302,12 +994,8 @@ uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) { // CHECK-LABEL: @test_vmull_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) { @@ -1316,12 +1004,8 @@ uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) { // CHECK-LABEL: @test_vmull_high_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // @@ -1331,12 +1015,8 @@ int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) { // CHECK-LABEL: @test_vmull_high_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 
x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // @@ -1346,12 +1026,8 @@ int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) { // CHECK-LABEL: @test_vmull_high_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // @@ -1361,12 +1037,8 @@ uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) { // CHECK-LABEL: @test_vmull_high_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // @@ -1376,14 +1048,9 @@ uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) { // CHECK-LABEL: @test_vqdmlal_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { @@ -1392,14 +1059,9 @@ int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlal_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { @@ -1408,15 +1070,10 @@ int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmlal_high_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { @@ -1425,15 +1082,10 @@ int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlal_high_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> 
[[B]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { @@ -1442,14 +1094,9 @@ int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmlsl_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { @@ -1458,14 +1105,9 @@ int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlsl_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 
x i32> [[B:%.*]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { @@ -1474,15 +1116,10 @@ int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmlsl_high_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { @@ -1491,15 +1128,10 @@ int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlsl_high_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { @@ -1508,13 +1140,8 @@ int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmull_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// 
CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) { @@ -1523,13 +1150,8 @@ int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vqdmull_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) { @@ -1538,13 +1160,8 @@ int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) { // CHECK-LABEL: @test_vqdmull_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) { @@ -1553,13 +1170,8 @@ int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) { // CHECK-LABEL: @test_vqdmull_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// 
CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) { @@ -1568,14 +1180,9 @@ int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) { // CHECK-LABEL: @test_vqdmull_high_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) { @@ -1584,14 +1191,9 @@ int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) { // CHECK-LABEL: @test_vqdmull_high_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) { @@ -1600,14 +1202,9 @@ int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vqdmull_high_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) { @@ -1616,14 +1213,9 @@ int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) { // CHECK-LABEL: @test_vqdmull_high_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) { @@ -1632,11 +1224,7 @@ int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) { // CHECK-LABEL: @test_vqdmulh_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 3) +// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[V:%.*]], i32 3) // CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANE_V2]] // int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) { @@ -1645,11 +1233,7 @@ int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vqdmulhq_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 3) +// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[A:%.*]], <4 x i16> [[V:%.*]], i32 3) // CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANE_V2]] // int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) { @@ -1658,11 +1242,7 @@ int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) { // CHECK-LABEL: @test_vqdmulh_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = 
bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 1) +// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[V:%.*]], i32 1) // CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANE_V2]] // int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) { @@ -1671,11 +1251,7 @@ int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) { // CHECK-LABEL: @test_vqdmulhq_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 1) +// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[A:%.*]], <2 x i32> [[V:%.*]], i32 1) // CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANE_V2]] // int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) { @@ -1684,11 +1260,7 @@ int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vqrdmulh_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 3) +// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[V:%.*]], i32 3) // CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANE_V2]] // int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) { @@ -1697,11 +1269,7 @@ int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vqrdmulhq_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 3) +// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[A:%.*]], <4 x i16> [[V:%.*]], i32 3) // CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANE_V2]] // int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) { @@ -1710,11 +1278,7 @@ int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) { // CHECK-LABEL: @test_vqrdmulh_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: 
[[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 1) +// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[V:%.*]], i32 1) // CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANE_V2]] // int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) { @@ -1723,11 +1287,7 @@ int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) { // CHECK-LABEL: @test_vqrdmulhq_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 1) +// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[A:%.*]], <2 x i32> [[V:%.*]], i32 1) // CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANE_V2]] // int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) { @@ -1736,9 +1296,7 @@ int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vmul_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32> // CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x float> [[MUL]] // @@ -1749,14 +1307,11 @@ float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) { // CHECK-LABEL: @test_vmul_lane_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0 -// CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]] -// CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double> -// CHECK-NEXT: ret <1 x double> [[TMP5]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A:%.*]], i64 0 +// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[V:%.*]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = fmul double [[TMP0]], [[EXTRACT]] +// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP2]] // float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) { return vmul_lane_f64(a, v, 0); @@ -1764,9 +1319,7 @@ float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) { // CHECK-LABEL: @test_vmulq_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> 
[[TMP1]], <2 x float> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <4 x i32>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
@@ -1776,9 +1329,7 @@ float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {

// CHECK-LABEL: @test_vmulq_lane_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x double> [[MUL]]
//
@@ -1788,9 +1339,7 @@ float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {

// CHECK-LABEL: @test_vmul_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <2 x i32>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x float> [[MUL]]
//
@@ -1800,14 +1349,11 @@ float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {

// CHECK-LABEL: @test_vmul_laneq_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-// CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
-// CHECK-NEXT: ret <1 x double> [[TMP5]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A:%.*]], i64 0
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V:%.*]], i64 1
+// CHECK-NEXT: [[TMP1:%.*]] = fmul double [[TMP0]], [[EXTRACT]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP2]]
//
float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
  return vmul_laneq_f64(a, v, 1);
@@ -1815,9 +1361,7 @@ float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {

// CHECK-LABEL: @test_vmulq_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x float> [[MUL]]
//
@@ -1827,9 +1371,7 @@ float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {

// CHECK-LABEL: @test_vmulq_laneq_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x double> [[MUL]]
//
@@ -1839,12 +1381,8 @@ float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {

// CHECK-LABEL: @test_vmulx_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A:%.*]], <2 x float> [[LANE]])
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
@@ -1853,12 +1391,8 @@ float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {

// CHECK-LABEL: @test_vmulxq_lane_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <4 x i32>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A:%.*]], <4 x float> [[LANE]])
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
@@ -1867,12 +1401,8 @@ float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {

// CHECK-LABEL: @test_vmulxq_lane_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A:%.*]], <2 x double> [[LANE]])
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
@@ -1881,12 +1411,8 @@ float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {

// CHECK-LABEL: @test_vmulx_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <2 x i32>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A:%.*]], <2 x float> [[LANE]])
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
@@ -1895,12 +1421,8 @@ float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {

// CHECK-LABEL: @test_vmulxq_laneq_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A:%.*]], <4 x float> [[LANE]])
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
@@ -1909,12 +1431,8 @@ float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {

// CHECK-LABEL: @test_vmulxq_laneq_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32>
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A:%.*]], <2 x double> [[LANE]])
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
@@ -1923,9 +1441,7 @@ float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {

// CHECK-LABEL: @test_vmla_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
@@ -1936,9 +1452,7 @@ int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {

// CHECK-LABEL: @test_vmlaq_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
@@ -1949,9 +1463,7 @@ int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {

// CHECK-LABEL: @test_vmla_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
@@ -1962,9 +1474,7 @@ int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {

// CHECK-LABEL: @test_vmlaq_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -1975,9 +1485,7 @@ int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {

// CHECK-LABEL: @test_vmla_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
@@ -1988,9 +1496,7 @@ int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {

// CHECK-LABEL: @test_vmlaq_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
@@ -2001,9 +1507,7 @@ int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {

// CHECK-LABEL: @test_vmla_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
@@ -2014,9 +1518,7 @@ int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {

// CHECK-LABEL: @test_vmlaq_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -2027,9 +1529,7 @@ int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {

// CHECK-LABEL: @test_vmls_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
@@ -2040,9 +1540,7 @@ int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {

// CHECK-LABEL: @test_vmlsq_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
@@ -2053,9 +1551,7 @@ int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {

// CHECK-LABEL: @test_vmls_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
@@ -2066,9 +1562,7 @@ int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {

// CHECK-LABEL: @test_vmlsq_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -2079,9 +1573,7 @@ int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {

// CHECK-LABEL: @test_vmls_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
@@ -2092,9 +1584,7 @@ int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {

// CHECK-LABEL: @test_vmlsq_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
@@ -2105,9 +1595,7 @@ int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {

// CHECK-LABEL: @test_vmls_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
@@ -2118,9 +1606,7 @@ int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {

// CHECK-LABEL: @test_vmlsq_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -2131,9 +1617,7 @@ int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {

// CHECK-LABEL: @test_vmul_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
@@ -2143,9 +1627,7 @@ int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {

// CHECK-LABEL: @test_vmulq_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
@@ -2155,9 +1637,7 @@ int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {

// CHECK-LABEL: @test_vmul_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
@@ -2167,9 +1647,7 @@ int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {

// CHECK-LABEL: @test_vmulq_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
@@ -2179,9 +1657,7 @@ int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {

// CHECK-LABEL: @test_vmul_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
@@ -2191,9 +1667,7 @@ uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {

// CHECK-LABEL: @test_vmulq_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
@@ -2203,9 +1677,7 @@ uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {

// CHECK-LABEL: @test_vmul_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
@@ -2215,9 +1687,7 @@ uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {

// CHECK-LABEL: @test_vmulq_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
@@ -2227,9 +1697,7 @@ uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {

// CHECK-LABEL: @test_vmul_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
@@ -2239,9 +1707,7 @@ int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {

// CHECK-LABEL: @test_vmulq_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
@@ -2251,9 +1717,7 @@ int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {

// CHECK-LABEL: @test_vmul_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
@@ -2263,9 +1727,7 @@ int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {

// CHECK-LABEL: @test_vmulq_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
@@ -2275,9 +1737,7 @@ int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {

// CHECK-LABEL: @test_vmul_laneq_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i16> [[MUL]]
//
@@ -2287,9 +1747,7 @@ uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {

// CHECK-LABEL: @test_vmulq_laneq_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <8 x i16> [[MUL]]
//
@@ -2299,9 +1757,7 @@ uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {

// CHECK-LABEL: @test_vmul_laneq_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <2 x i32> [[MUL]]
//
@@ -2311,9 +1767,7 @@ uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {

// CHECK-LABEL: @test_vmulq_laneq_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]]
// CHECK-NEXT: ret <4 x i32> [[MUL]]
//
@@ -2323,14 +1777,8 @@ uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {

// CHECK-LABEL: @test_vfma_lane_f32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B:%.*]], <2 x float> [[LANE]], <2 x float> [[A:%.*]])
// CHECK-NEXT: ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
@@ -2339,14 +1787,8 @@ float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {

// CHECK-LABEL: @test_vfmaq_lane_f32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[LANE]], <4 x float> [[A:%.*]])
// CHECK-NEXT: ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
@@ -2355,15 +1797,9 @@ float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {

// CHECK-LABEL: @test_vfma_laneq_f32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
-// CHECK-NEXT: ret <2 x float> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[B:%.*]], <2 x float> [[A:%.*]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
//
float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
  return vfma_laneq_f32(a, b, v, 0);
@@ -2371,15 +1807,9 @@ float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {

// CHECK-LABEL: @test_vfmaq_laneq_f32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
-// CHECK-NEXT: ret <4 x float> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[B:%.*]], <4 x float> [[A:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
//
float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
  return vfmaq_laneq_f32(a, b, v, 0);
@@ -2388,14 +1818,8 @@ float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v)

// CHECK-LABEL: @test_vfms_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG]], <2 x float> [[LANE]], <2 x float> [[A:%.*]])
// CHECK-NEXT: ret <2 x float> [[FMLA2]]
//
float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
@@ -2405,14 +1829,8 @@ float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {

// CHECK-LABEL: @test_vfmsq_lane_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG]], <4 x float> [[LANE]], <4 x float> [[A:%.*]])
// CHECK-NEXT: ret <4 x float> [[FMLA2]]
//
float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
@@ -2422,15 +1840,9 @@ float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {

// CHECK-LABEL: @test_vfms_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
-// CHECK-NEXT: ret <2 x float> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[FNEG]], <2 x float> [[A:%.*]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
//
float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
  return vfms_laneq_f32(a, b, v, 0);
@@ -2439,15 +1851,9 @@ float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {

// CHECK-LABEL: @test_vfmsq_laneq_f32_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x float> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
-// CHECK-NEXT: ret <4 x float> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[FNEG]], <4 x float> [[A:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
//
float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
  return vfmsq_laneq_f32(a, b, v, 0);
@@ -2455,15 +1861,9 @@ float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v)

// CHECK-LABEL: @test_vfmaq_laneq_f64_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
-// CHECK-NEXT: ret <2 x double> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[B:%.*]], <2 x double> [[A:%.*]])
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
//
float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
  return vfmaq_laneq_f64(a, b, v, 0);
@@ -2472,15 +1872,9 @@ float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v)

// CHECK-LABEL: @test_vfmsq_laneq_f64_0(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <2 x double> [[B:%.*]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
-// CHECK-NEXT: ret <2 x double> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[FNEG]], <2 x double> [[A:%.*]])
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
//
float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
  return vfmsq_laneq_f64(a, b, v, 0);
@@ -2488,12 +1882,8 @@ float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v)

// CHECK-LABEL: @test_vmlal_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -2503,12 +1893,8 @@ int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {

// CHECK-LABEL: @test_vmlal_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -2518,12 +1904,8 @@ int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {

// CHECK-LABEL: @test_vmlal_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -2533,12 +1915,8 @@ int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {

// CHECK-LABEL: @test_vmlal_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -2548,12 +1926,8 @@ int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {

// CHECK-LABEL: @test_vmlal_high_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -2564,12 +1938,8 @@ int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {

// CHECK-LABEL: @test_vmlal_high_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -2580,12 +1950,8 @@ int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {

// CHECK-LABEL: @test_vmlal_high_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -2596,12 +1962,8 @@ int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {

// CHECK-LABEL: @test_vmlal_high_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -2612,12 +1974,8 @@ int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {

// CHECK-LABEL: @test_vmlsl_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -2627,12 +1985,8 @@ int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {

// CHECK-LABEL: @test_vmlsl_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -2642,12 +1996,8 @@ int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {

// CHECK-LABEL: @test_vmlsl_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -2657,12 +2007,8 @@ int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {

// CHECK-LABEL: @test_vmlsl_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -2672,12 +2018,8 @@ int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {

// CHECK-LABEL: @test_vmlsl_high_lane_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -2688,12 +2030,8 @@ int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {

// CHECK-LABEL: @test_vmlsl_high_lane_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
@@ -2704,12 +2042,8 @@ int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {

// CHECK-LABEL: @test_vmlsl_high_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -2720,12 +2054,8 @@ int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {

// CHECK-LABEL: @test_vmlsl_high_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
@@ -2736,12 +2066,8 @@ int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {

// CHECK-LABEL: @test_vmlal_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -2751,12 +2077,8 @@ int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {

// CHECK-LABEL: @test_vmlal_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -2766,12 +2088,8 @@ int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {

// CHECK-LABEL: @test_vmlal_laneq_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -2781,12 +2099,8 @@ int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {

// CHECK-LABEL: @test_vmlal_laneq_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -2796,12 +2110,8 @@ int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {

// CHECK-LABEL: @test_vmlal_high_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -2812,12 +2122,8 @@ int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {

// CHECK-LABEL: @test_vmlal_high_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -2828,12 +2134,8 @@ int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {

// CHECK-LABEL: @test_vmlal_high_laneq_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -2844,12 +2146,8 @@ int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {

// CHECK-LABEL: @test_vmlal_high_laneq_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
@@ -2860,12 +2158,8 @@ int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {

// CHECK-LABEL: @test_vmlsl_lane_u16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -2875,12 +2169,8 @@ int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {

// CHECK-LABEL: @test_vmlsl_lane_u32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -2890,12 +2180,8 @@ int64x2_t test_vmlsl_lane_u32_0(int64x2_t
a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlsl_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2905,12 +2191,8 @@ int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlsl_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2920,12 +2202,8 @@ int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlsl_high_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] @@ -2936,12 +2214,8 @@ int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlsl_high_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x 
i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] @@ -2952,12 +2226,8 @@ int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlsl_high_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] @@ -2968,12 +2238,8 @@ int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlsl_high_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] @@ -2984,12 +2250,8 @@ int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmull_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], 
<4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) { @@ -2998,12 +2260,8 @@ int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vmull_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) { @@ -3012,12 +2270,8 @@ int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) { // CHECK-LABEL: @test_vmull_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) { @@ -3026,12 +2280,8 @@ uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) { // CHECK-LABEL: @test_vmull_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) { @@ -3040,12 +2290,8 @@ uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) { // CHECK-LABEL: @test_vmull_high_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x 
i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // @@ -3055,12 +2301,8 @@ int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { // CHECK-LABEL: @test_vmull_high_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // @@ -3070,12 +2312,8 @@ int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vmull_high_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // @@ -3085,12 +2323,8 @@ uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) { // CHECK-LABEL: @test_vmull_high_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// 
CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // @@ -3100,12 +2334,8 @@ uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) { // CHECK-LABEL: @test_vmull_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) { @@ -3114,12 +2344,8 @@ int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) { // CHECK-LABEL: @test_vmull_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) { @@ -3128,12 +2354,8 @@ int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) { // CHECK-LABEL: @test_vmull_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) { @@ -3142,12 +2364,8 @@ uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) { // CHECK-LABEL: @test_vmull_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> 
[[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) { @@ -3156,12 +2374,8 @@ uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) { // CHECK-LABEL: @test_vmull_high_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // @@ -3171,12 +2385,8 @@ int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { // CHECK-LABEL: @test_vmull_high_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // @@ -3186,12 +2396,8 @@ int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { // CHECK-LABEL: @test_vmull_high_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +// 
CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // @@ -3201,12 +2407,8 @@ uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) { // CHECK-LABEL: @test_vmull_high_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // @@ -3216,14 +2418,9 @@ uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) { // CHECK-LABEL: @test_vqdmlal_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { @@ -3232,14 +2429,9 @@ int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlal_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) 
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { @@ -3248,15 +2440,10 @@ int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmlal_high_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { @@ -3265,15 +2452,10 @@ int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlal_high_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { @@ -3282,14 +2464,9 @@ int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: 
@test_vqdmlsl_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { @@ -3298,14 +2475,9 @@ int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlsl_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { @@ -3314,15 +2486,10 @@ int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> 
zeroinitializer // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { @@ -3331,15 +2498,10 @@ int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { @@ -3348,13 +2510,8 @@ int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmull_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) { @@ -3363,13 +2520,8 @@ int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vqdmull_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = 
bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) { @@ -3378,13 +2530,8 @@ int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) { // CHECK-LABEL: @test_vqdmull_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[LANE]]) // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) { @@ -3393,13 +2540,8 @@ int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) { // CHECK-LABEL: @test_vqdmull_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[LANE]]) // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) { @@ -3408,14 +2550,9 @@ int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) { // CHECK-LABEL: @test_vqdmull_high_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: 
[[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { @@ -3424,14 +2561,9 @@ int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { // CHECK-LABEL: @test_vqdmull_high_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { @@ -3440,14 +2572,9 @@ int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vqdmull_high_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { @@ -3456,14 +2583,9 @@ int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { // CHECK-LABEL: @test_vqdmull_high_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 
x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { @@ -3472,11 +2594,7 @@ int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { // CHECK-LABEL: @test_vqdmulh_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[VQDMULH_LANE_V]], <4 x i16> [[VQDMULH_LANE_V1]], i32 0) +// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.lane.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[V:%.*]], i32 0) // CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANE_V2]] // int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) { @@ -3485,11 +2603,7 @@ int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vqdmulhq_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[VQDMULHQ_LANE_V]], <4 x i16> [[VQDMULHQ_LANE_V1]], i32 0) +// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.lane.v8i16.v4i16(<8 x i16> [[A:%.*]], <4 x i16> [[V:%.*]], i32 0) // CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANE_V2]] // int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) { @@ -3498,11 +2612,7 @@ int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) { // CHECK-LABEL: @test_vqdmulh_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[VQDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VQDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[VQDMULH_LANE_V]], <2 x i32> [[VQDMULH_LANE_V1]], i32 0) +// CHECK-NEXT: [[VQDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.lane.v2i32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[V:%.*]], i32 0) // CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANE_V2]] // int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) { @@ -3511,11 +2621,7 @@ int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) { // CHECK-LABEL: @test_vqdmulhq_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: 
[[VQDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[VQDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[VQDMULHQ_LANE_V]], <2 x i32> [[VQDMULHQ_LANE_V1]], i32 0) +// CHECK-NEXT: [[VQDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.lane.v4i32.v2i32(<4 x i32> [[A:%.*]], <2 x i32> [[V:%.*]], i32 0) // CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANE_V2]] // int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) { @@ -3524,11 +2630,7 @@ int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vqrdmulh_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[VQRDMULH_LANE_V]], <4 x i16> [[VQRDMULH_LANE_V1]], i32 0) +// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[V:%.*]], i32 0) // CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANE_V2]] // int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) { @@ -3537,11 +2639,7 @@ int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vqrdmulhq_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[VQRDMULHQ_LANE_V]], <4 x i16> [[VQRDMULHQ_LANE_V1]], i32 0) +// CHECK-NEXT: [[VQRDMULHQ_LANE_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.lane.v8i16.v4i16(<8 x i16> [[A:%.*]], <4 x i16> [[V:%.*]], i32 0) // CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANE_V2]] // int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) { @@ -3550,11 +2648,7 @@ int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) { // CHECK-LABEL: @test_vqrdmulh_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[VQRDMULH_LANE_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[VQRDMULH_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[VQRDMULH_LANE_V]], <2 x i32> [[VQRDMULH_LANE_V1]], i32 0) +// CHECK-NEXT: [[VQRDMULH_LANE_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v2i32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[V:%.*]], i32 0) // CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANE_V2]] // int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) { @@ -3563,11 +2657,7 @@ int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) { // CHECK-LABEL: @test_vqrdmulhq_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast 
<2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_LANE_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT:    [[VQRDMULHQ_LANE_V1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[VQRDMULHQ_LANE_V]], <2 x i32> [[VQRDMULHQ_LANE_V1]], i32 0)
+// CHECK-NEXT:    [[VQRDMULHQ_LANE_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.lane.v4i32.v2i32(<4 x i32> [[A:%.*]], <2 x i32> [[V:%.*]], i32 0)
 // CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_LANE_V2]]
 //
 int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
@@ -3576,9 +2666,7 @@ int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
 
 // CHECK-LABEL: @test_vmul_lane_f32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x float> [[MUL]]
 //
@@ -3588,9 +2676,7 @@ float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
 
 // CHECK-LABEL: @test_vmulq_lane_f32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x float> [[MUL]]
 //
@@ -3600,9 +2686,7 @@ float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
 
 // CHECK-LABEL: @test_vmul_laneq_f32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x float> [[MUL]]
 //
@@ -3612,14 +2696,11 @@ float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
 
 // CHECK-LABEL: @test_vmul_laneq_f64_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-// CHECK-NEXT:    [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
-// CHECK-NEXT:    [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
-// CHECK-NEXT:    ret <1 x double> [[TMP5]]
+// CHECK-NEXT:    [[TMP0:%.*]] = extractelement <1 x double> [[A:%.*]], i64 0
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[V:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = fmul double [[TMP0]], [[EXTRACT]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast double [[TMP1]] to <1 x double>
+// CHECK-NEXT:    ret <1 x double> [[TMP2]]
 //
 float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
   return vmul_laneq_f64(a, v, 0);
@@ -3627,9 +2708,7 @@ float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
 
 // CHECK-LABEL: @test_vmulq_laneq_f32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <4 x float> [[MUL]]
 //
@@ -3639,9 +2718,7 @@ float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
 
 // CHECK-LABEL: @test_vmulq_laneq_f64_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]]
 // CHECK-NEXT:    ret <2 x double> [[MUL]]
 //
@@ -3651,12 +2728,8 @@ float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
 
 // CHECK-LABEL: @test_vmulx_lane_f32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A:%.*]], <2 x float> [[LANE]])
 // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
 //
 float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
@@ -3665,12 +2738,8 @@ float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
 
 // CHECK-LABEL: @test_vmulxq_lane_f32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A:%.*]], <4 x float> [[LANE]])
 // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
 //
 float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
@@ -3679,12 +2748,8 @@ float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
 
 // CHECK-LABEL: @test_vmulxq_lane_f64_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A:%.*]], <2 x double> [[LANE]])
 // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
 //
 float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
@@ -3693,12 +2758,8 @@ float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
 
 // CHECK-LABEL: @test_vmulx_laneq_f32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A:%.*]], <2 x float> [[LANE]])
 // CHECK-NEXT:    ret <2 x float> [[VMULX2_I]]
 //
 float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
@@ -3707,12 +2768,8 @@ float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
 
 // CHECK-LABEL: @test_vmulxq_laneq_f32_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A:%.*]], <4 x float> [[LANE]])
 // CHECK-NEXT:    ret <4 x float> [[VMULX2_I]]
 //
 float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
@@ -3721,12 +2778,8 @@ float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
 
 // CHECK-LABEL: @test_vmulxq_laneq_f64_0(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A:%.*]], <2 x double> [[LANE]])
 // CHECK-NEXT:    ret <2 x double> [[VMULX2_I]]
 //
 float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
@@ -3735,13 +2788,9 @@ float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
 
 // CHECK-LABEL: @test_vmull_high_n_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
 //
@@ -3751,11 +2800,9 @@ int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
 
 // CHECK-LABEL: @test_vmull_high_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
 //
@@ -3765,13 +2812,9 @@ int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
 
 // CHECK-LABEL: @test_vmull_high_n_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
 //
@@ -3781,11 +2824,9 @@ uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
 
 // CHECK-LABEL: @test_vmull_high_n_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
 //
@@ -3795,15 +2836,10 @@ uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
 
 // CHECK-LABEL: @test_vqdmull_high_n_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I_I]]
 //
 int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
@@ -3812,13 +2848,10 @@ int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
 
 // CHECK-LABEL: @test_vqdmull_high_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I_I]]
 //
 int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
@@ -3827,13 +2860,9 @@ int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
 
 // CHECK-LABEL: @test_vmlal_high_n_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
@@ -3844,11 +2873,9 @@ int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
 
 // CHECK-LABEL: @test_vmlal_high_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
@@ -3859,13 +2886,9 @@ int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
 
 // CHECK-LABEL: @test_vmlal_high_n_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
@@ -3876,11 +2899,9 @@ uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
 
 // CHECK-LABEL: @test_vmlal_high_n_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
 // CHECK-NEXT:    ret <2 x i64> [[ADD_I]]
@@ -3891,16 +2912,11 @@ uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
 
 // CHECK-LABEL: @test_vqdmlal_high_n_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
+// CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I_I]])
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLAL_V3_I_I]]
 //
 int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
@@ -3909,14 +2925,11 @@ int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
 
 // CHECK-LABEL: @test_vqdmlal_high_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
+// CHECK-NEXT:    [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I_I]])
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLAL_V3_I_I]]
 //
 int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
@@ -3925,13 +2938,9 @@ int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
 
 // CHECK-LABEL: @test_vmlsl_high_n_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
@@ -3942,11 +2951,9 @@ int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
 
 // CHECK-LABEL: @test_vmlsl_high_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
@@ -3957,13 +2964,9 @@ int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
 
 // CHECK-LABEL: @test_vmlsl_high_n_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
 // CHECK-NEXT:    ret <4 x i32> [[SUB_I]]
@@ -3974,11 +2977,9 @@ uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
 
 // CHECK-LABEL: @test_vmlsl_high_n_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
 // CHECK-NEXT:    ret <2 x i64> [[SUB_I]]
@@ -3989,16 +2990,11 @@ uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
 
 // CHECK-LABEL: @test_vqdmlsl_high_n_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
+// CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I_I]])
 // CHECK-NEXT:    ret <4 x i32> [[VQDMLSL_V3_I_I]]
 //
 int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
@@ -4007,14 +3003,11 @@ int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
 
 // CHECK-LABEL: @test_vqdmlsl_high_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> <i32 2, i32 3>
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
+// CHECK-NEXT:    [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I_I]])
 // CHECK-NEXT:    ret <2 x i64> [[VQDMLSL_V3_I_I]]
 //
 int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
@@ -4023,8 +3016,8 @@ int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
 
 // CHECK-LABEL: @test_vmul_n_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[B]], i32 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL_I:%.*]] = fmul <2 x float> [[A:%.*]], [[VECINIT1_I]]
 // CHECK-NEXT:    ret <2 x float> [[MUL_I]]
 //
@@ -4034,10 +3027,8 @@ float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
 
 // CHECK-LABEL: @test_vmulq_n_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[B]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL_I:%.*]] = fmul <4 x float> [[A:%.*]], [[VECINIT3_I]]
 // CHECK-NEXT:    ret <4 x float> [[MUL_I]]
 //
@@ -4047,8 +3038,8 @@ float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
 
 // CHECK-LABEL: @test_vmulq_n_f64(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[B]], i32 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x double> [[VECINIT_I]], <2 x double> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL_I:%.*]] = fmul <2 x double> [[A:%.*]], [[VECINIT1_I]]
 // CHECK-NEXT:    ret <2 x double> [[MUL_I]]
 //
@@ -4058,13 +3049,10 @@ float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
 
 // CHECK-LABEL: @test_vfma_n_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
-// CHECK-NEXT:    ret <2 x float> [[TMP3]]
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B:%.*]], <2 x float> [[VECINIT1_I]], <2 x float> [[A:%.*]])
+// CHECK-NEXT:    ret <2 x float> [[TMP0]]
 //
 float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
   return vfma_n_f32(a, b, n);
@@ -4072,12 +3060,9 @@ float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
 
 // CHECK-LABEL: @test_vfma_n_f64(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <1 x double> poison, double [[N:%.*]], i32 0
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[VECINIT_I]], <1 x double> [[A]])
-// CHECK-NEXT:    ret <1 x double> [[TMP3]]
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <1 x double> poison, double [[N:%.*]], i64 0
+// CHECK-NEXT:    [[TMP0:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B:%.*]], <1 x double> [[VECINIT_I]], <1 x double> [[A:%.*]])
+// CHECK-NEXT:    ret <1 x double> [[TMP0]]
 //
 float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
   return vfma_n_f64(a, b, n);
@@ -4085,15 +3070,10 @@ float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
 
 // CHECK-LABEL: @test_vfmaq_n_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
-// CHECK-NEXT:    ret <4 x float> [[TMP3]]
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B:%.*]], <4 x float> [[VECINIT3_I]], <4 x float> [[A:%.*]])
+// CHECK-NEXT:    ret <4 x float> [[TMP0]]
 //
 float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
   return vfmaq_n_f32(a, b, n);
@@ -4102,13 +3082,10 @@ float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
 // CHECK-LABEL: @test_vfms_n_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <2 x float> [[B:%.*]]
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
-// CHECK-NEXT:    ret <2 x float> [[TMP3]]
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A:%.*]])
+// CHECK-NEXT:    ret <2 x float> [[TMP0]]
 //
 float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
   return vfms_n_f32(a, b, n);
@@ -4117,12 +3094,9 @@ float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
 // CHECK-LABEL: @test_vfms_n_f64(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <1 x double> [[B:%.*]]
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <1 x double> poison, double [[N:%.*]], i32 0
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[FNEG_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A]])
-// CHECK-NEXT:    ret <1 x double> [[TMP3]]
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <1 x double> poison, double [[N:%.*]], i64 0
+// CHECK-NEXT:    [[TMP0:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A:%.*]])
+// CHECK-NEXT:    ret <1 x double> [[TMP0]]
 //
 float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
   return vfms_n_f64(a, b, n);
@@ -4131,15 +3105,10 @@ float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
 // CHECK-LABEL: @test_vfmsq_n_f32(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <4 x float> [[B:%.*]]
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <16 x i8>
-// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
-// CHECK-NEXT:    ret <4 x float> [[TMP3]]
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A:%.*]])
+// CHECK-NEXT:    ret <4 x float> [[TMP0]]
 //
 float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
   return vfmsq_n_f32(a, b, n);
@@ -4147,10 +3116,8 @@ float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
 
 // CHECK-LABEL: @test_vmul_n_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[A:%.*]], [[VECINIT3_I]]
 // CHECK-NEXT:    ret <4 x i16> [[MUL_I]]
 //
@@ -4160,14 +3127,8 @@ int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
 
 // CHECK-LABEL: @test_vmulq_n_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
-// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
-// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
-// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[A:%.*]], [[VECINIT7_I]]
 // CHECK-NEXT:    ret <8 x i16> [[MUL_I]]
 //
@@ -4177,8 +3138,8 @@ int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
 
 // CHECK-LABEL: @test_vmul_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[A:%.*]], [[VECINIT1_I]]
 // CHECK-NEXT:    ret <2 x i32> [[MUL_I]]
 //
@@ -4188,10 +3149,8 @@ int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
 
 // CHECK-LABEL: @test_vmulq_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[A:%.*]], [[VECINIT3_I]]
 // CHECK-NEXT:    ret <4 x i32> [[MUL_I]]
 //
@@ -4201,10 +3160,8 @@ int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
 
 // CHECK-LABEL: @test_vmul_n_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[A:%.*]], [[VECINIT3_I]]
 // CHECK-NEXT:    ret <4 x i16> [[MUL_I]]
 //
@@ -4214,14 +3171,8 @@ uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
 
 // CHECK-LABEL: @test_vmulq_n_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
-// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
-// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
-// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[A:%.*]], [[VECINIT7_I]]
 // CHECK-NEXT:    ret <8 x i16> [[MUL_I]]
 //
@@ -4231,8 +3182,8 @@ uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
 
 // CHECK-LABEL: @test_vmul_n_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[A:%.*]], [[VECINIT1_I]]
 // CHECK-NEXT:    ret <2 x i32> [[MUL_I]]
 //
@@ -4242,10 +3193,8 @@ uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
 
 // CHECK-LABEL: @test_vmulq_n_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[A:%.*]], [[VECINIT3_I]]
 // CHECK-NEXT:    ret <4 x i32> [[MUL_I]]
 //
@@ -4255,13 +3204,9 @@ uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
 
 // CHECK-LABEL: @test_vmull_n_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[VECINIT3_I]])
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
 //
 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
@@ -4270,11 +3215,9 @@ int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
 
 // CHECK-LABEL: @test_vmull_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[VECINIT1_I]])
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
 //
 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
@@ -4283,13 +3226,9 @@ int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
 
 // CHECK-LABEL: @test_vmull_n_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[VECINIT3_I]])
 // CHECK-NEXT:    ret <4 x i32> [[VMULL2_I_I]]
 //
 uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
@@ -4298,11 +3237,9 @@ uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
 
 // CHECK-LABEL: @test_vmull_n_u32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[VECINIT1_I]])
 // CHECK-NEXT:    ret <2 x i64> [[VMULL2_I_I]]
 //
 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
@@ -4311,14 +3248,9 @@ uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
 
 // CHECK-LABEL: @test_vqdmull_n_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A:%.*]], <4 x i16> [[VECINIT3_I]])
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULL_V2_I_I]]
 //
 int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
@@ -4327,12 +3259,9 @@ int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
 
 // CHECK-LABEL: @test_vqdmull_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT:    [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A:%.*]], <2 x i32> [[VECINIT1_I]])
 // CHECK-NEXT:    ret <2 x i64> [[VQDMULL_V2_I_I]]
 //
 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
@@ -4341,14 +3270,9 @@ int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
 
 // CHECK-LABEL: @test_vqdmulh_n_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VECINIT3_I]])
 // CHECK-NEXT:    ret <4 x i16> [[VQDMULH_V2_I]]
 //
 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
@@ -4357,18 +3281,9 @@ int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
 
 // CHECK-LABEL: @test_vqdmulhq_n_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
-// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
-// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
-// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VECINIT7_I]])
 // CHECK-NEXT:    ret <8 x i16> [[VQDMULHQ_V2_I]]
 //
 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
@@ -4377,12 +3292,9 @@ int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
 
 // CHECK-LABEL: @test_vqdmulh_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT:    [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VECINIT1_I]])
 // CHECK-NEXT:    ret <2 x i32> [[VQDMULH_V2_I]]
 //
 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
@@ -4391,14 +3303,9 @@ int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
 
 // CHECK-LABEL: @test_vqdmulhq_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
-// CHECK-NEXT:    [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VECINIT3_I]])
 // CHECK-NEXT:    ret <4 x i32> [[VQDMULHQ_V2_I]]
 //
 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
@@ -4407,14 +3314,9 @@ int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
 
 // CHECK-LABEL: @test_vqrdmulh_n_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[VECINIT3_I]])
 // CHECK-NEXT:    ret <4 x i16> [[VQRDMULH_V2_I]]
 //
 int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
@@ -4423,18 +3325,9 @@ int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
 
 // CHECK-LABEL: @test_vqrdmulhq_n_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
-// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[B]], i32 4
-// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[B]], i32 5
-// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[B]], i32 6
-// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[VECINIT7_I]])
 // CHECK-NEXT:    ret <8 x i16> [[VQRDMULHQ_V2_I]]
 //
 int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
@@ -4443,12 +3336,9 @@ int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
 
 // CHECK-LABEL: @test_vqrdmulh_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
-// CHECK-NEXT:    [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[VECINIT1_I]])
 // CHECK-NEXT:    ret <2 x i32> [[VQRDMULH_V2_I]]
 //
 int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
@@ -4457,14 +3347,9 @@ int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
 
 // CHECK-LABEL: @test_vqrdmulhq_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[B]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[B]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
-// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
-// CHECK-NEXT:    [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VECINIT3_I]])
 // CHECK-NEXT:    ret <4 x i32> [[VQRDMULHQ_V2_I]]
 //
 int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
@@ -4473,10 +3358,8 @@ int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
 
 // CHECK-LABEL: @test_vmla_n_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[A:%.*]], [[MUL_I]]
 // CHECK-NEXT:    ret <4 x i16> [[ADD_I]]
@@ -4487,14 +3370,8 @@ int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
 
 // CHECK-LABEL: @test_vmlaq_n_s16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
-// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4
-// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5
-// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6
-// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]]
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <8 x i16> [[A:%.*]], [[MUL_I]]
 // CHECK-NEXT:    ret <8 x i16> [[ADD_I]]
@@ -4505,8 +3382,8 @@ int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
 
 // CHECK-LABEL: @test_vmla_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]]
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[A:%.*]], [[MUL_I]]
 // CHECK-NEXT:    ret <2 x i32> [[ADD_I]]
@@ -4517,10 +3394,8 @@ int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
 
 // CHECK-LABEL: @test_vmlaq_n_s32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]]
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[MUL_I]]
 // CHECK-NEXT:    ret <4 x i32> [[ADD_I]]
@@ -4531,10 +3406,8 @@ int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
 
 // CHECK-LABEL: @test_vmla_n_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0
-// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1
-// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2
-// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
 // CHECK-NEXT:    [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]]
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i16> [[A:%.*]], [[MUL_I]]
 // CHECK-NEXT:    ret <4 x i16> [[ADD_I]]
@@ -4545,14 +3418,8 @@ uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
 
 // CHECK-LABEL: @test_vmlaq_n_u16(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[VECINIT_I:%.*]] =
insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 -// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4 -// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5 -// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6 -// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer // CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]] // CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A:%.*]], [[MUL_I]] // CHECK-NEXT: ret <8 x i16> [[ADD_I]] @@ -4563,8 +3430,8 @@ uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { // CHECK-LABEL: @test_vmla_n_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]] // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A:%.*]], [[MUL_I]] // CHECK-NEXT: ret <2 x i32> [[ADD_I]] @@ -4575,10 +3442,8 @@ uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { // CHECK-LABEL: @test_vmlaq_n_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]] // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[MUL_I]] // CHECK-NEXT: ret <4 x i32> [[ADD_I]] @@ -4589,13 +3454,9 @@ uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { // CHECK-LABEL: @test_vmlal_n_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VECINIT_I:%.*]] = 
insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[VECINIT3_I]]) // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <4 x i32> [[ADD_I]] // @@ -4605,11 +3466,9 @@ int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { // CHECK-LABEL: @test_vmlal_n_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[VECINIT1_I]]) // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <2 x i64> [[ADD_I]] // @@ -4619,13 +3478,9 @@ int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { // CHECK-LABEL: @test_vmlal_n_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[VECINIT3_I]]) // CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <4 x i32> [[ADD_I]] // @@ -4635,11 +3490,9 @@ uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { // CHECK-LABEL: @test_vmlal_n_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: 
[[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[VECINIT1_I]]) // CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <2 x i64> [[ADD_I]] // @@ -4649,15 +3502,10 @@ uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { // CHECK-LABEL: @test_vqdmlal_n_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) -// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]] // int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { @@ -4666,13 +3514,10 @@ int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { // CHECK-LABEL: @test_vqdmlal_n_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) -// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]] // int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { @@ -4681,10 +3526,8 @@ int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { // CHECK-LABEL: @test_vmls_n_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], 
i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]] // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL_I]] // CHECK-NEXT: ret <4 x i16> [[SUB_I]] @@ -4695,14 +3538,8 @@ int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) { // CHECK-LABEL: @test_vmlsq_n_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 -// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4 -// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5 -// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6 -// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer // CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]] // CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL_I]] // CHECK-NEXT: ret <8 x i16> [[SUB_I]] @@ -4713,8 +3550,8 @@ int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) { // CHECK-LABEL: @test_vmls_n_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]] // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL_I]] // CHECK-NEXT: ret <2 x i32> [[SUB_I]] @@ -4725,10 +3562,8 @@ int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) { // CHECK-LABEL: @test_vmlsq_n_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]] // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL_I]] // CHECK-NEXT: ret <4 x i32> [[SUB_I]] @@ -4739,10 +3574,8 @@ int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, 
int32_t c) { // CHECK-LABEL: @test_vmls_n_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B:%.*]], [[VECINIT3_I]] // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL_I]] // CHECK-NEXT: ret <4 x i16> [[SUB_I]] @@ -4753,14 +3586,8 @@ uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) { // CHECK-LABEL: @test_vmlsq_n_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 -// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[C]], i32 4 -// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[C]], i32 5 -// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[C]], i32 6 -// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[C]], i32 7 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer // CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B:%.*]], [[VECINIT7_I]] // CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL_I]] // CHECK-NEXT: ret <8 x i16> [[SUB_I]] @@ -4771,8 +3598,8 @@ uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) { // CHECK-LABEL: @test_vmls_n_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B:%.*]], [[VECINIT1_I]] // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL_I]] // CHECK-NEXT: ret <2 x i32> [[SUB_I]] @@ -4783,10 +3610,8 @@ uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) { // CHECK-LABEL: @test_vmlsq_n_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[C]], i32 3 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer // 
CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B:%.*]], [[VECINIT3_I]] // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL_I]] // CHECK-NEXT: ret <4 x i32> [[SUB_I]] @@ -4797,13 +3622,9 @@ uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) { // CHECK-LABEL: @test_vmlsl_n_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[VECINIT3_I]]) // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <4 x i32> [[SUB_I]] // @@ -4813,11 +3634,9 @@ int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { // CHECK-LABEL: @test_vmlsl_n_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[VECINIT1_I]]) // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <2 x i64> [[SUB_I]] // @@ -4827,13 +3646,9 @@ int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { // CHECK-LABEL: @test_vmlsl_n_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 
x i16> [[B:%.*]], <4 x i16> [[VECINIT3_I]]) // CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <4 x i32> [[SUB_I]] // @@ -4843,11 +3658,9 @@ uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) { // CHECK-LABEL: @test_vmlsl_n_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[VECINIT1_I]]) // CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]] // CHECK-NEXT: ret <2 x i64> [[SUB_I]] // @@ -4857,15 +3670,10 @@ uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { // CHECK-LABEL: @test_vqdmlsl_n_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) -// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[VECINIT3_I]]) +// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]] // int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { @@ -4874,13 +3682,10 @@ int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { // CHECK-LABEL: @test_vqdmlsl_n_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) -// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[VECINIT1_I]]) +// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]] // int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { @@ -4889,9 +3694,7 @@ int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { // CHECK-LABEL: @test_vmla_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] @@ -4902,9 +3705,7 @@ uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmlaq_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] @@ -4915,9 +3716,7 @@ uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmla_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] @@ -4928,9 +3727,7 @@ uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmlaq_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] @@ -4941,9 +3738,7 @@ uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmla_laneq_u16_0( // CHECK-NEXT: 
entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] @@ -4954,9 +3749,7 @@ uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmlaq_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] @@ -4967,9 +3760,7 @@ uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmla_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] @@ -4980,9 +3771,7 @@ uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) { // CHECK-LABEL: @test_vmlaq_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] @@ -4993,14 +3782,9 @@ uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) { // CHECK-LABEL: @test_vqdmlal_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x 
i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { @@ -5009,14 +3793,9 @@ int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlal_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { @@ -5025,15 +3804,10 @@ int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { @@ -5042,15 +3816,10 @@ int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { @@ -5059,9 +3828,7 @@ int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmls_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] @@ -5072,9 +3839,7 @@ uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmlsq_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] @@ -5085,9 +3850,7 @@ uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmls_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] @@ -5098,9 +3861,7 @@ uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmlsq_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// 
CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] @@ -5111,9 +3872,7 @@ uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmls_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] @@ -5124,9 +3883,7 @@ uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmlsq_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] @@ -5137,9 +3894,7 @@ uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmls_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] @@ -5150,9 +3905,7 @@ uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) { // CHECK-LABEL: @test_vmlsq_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] @@ -5163,14 +3916,9 @@ uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) { // CHECK-LABEL: @test_vqdmlsl_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer -// 
CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { @@ -5179,14 +3927,9 @@ int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlsl_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]]) // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { @@ -5195,15 +3938,10 @@ int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) -// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) +// CHECK-NEXT: 
[[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
@@ -5212,15 +3950,10 @@ int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
@@ -5229,11 +3962,7 @@ int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vqdmulh_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[A:%.*]], <8 x i16> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANEQ_V2]]
//
int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
@@ -5242,11 +3971,7 @@ int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmulhq_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
@@ -5255,11 +3980,7 @@ int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmulh_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[A:%.*]], <4 x i32> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANEQ_V2]]
//
int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
@@ -5268,11 +3989,7 @@ int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqdmulhq_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
@@ -5281,11 +3998,7 @@ int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqrdmulh_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[A:%.*]], <8 x i16> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
//
int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
@@ -5294,11 +4007,7 @@ int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmulhq_laneq_s16_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
@@ -5307,11 +4016,7 @@ int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmulh_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[A:%.*]], <4 x i32> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
//
int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
@@ -5320,11 +4025,7 @@ int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqrdmulhq_laneq_s32_0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 0)
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[V:%.*]], i32 0)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
@@ -5333,9 +4034,7 @@ int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vmla_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
@@ -5346,9 +4045,7 @@ uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
// CHECK-LABEL: @test_vmlaq_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
@@ -5359,9 +4056,7 @@ uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
// CHECK-LABEL: @test_vmla_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
@@ -5372,9 +4067,7 @@ uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
// CHECK-LABEL: @test_vmlaq_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -5385,9 +4078,7 @@ uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
// CHECK-LABEL: @test_vmla_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[ADD]]
@@ -5398,9 +4089,7 @@ uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
// CHECK-LABEL: @test_vmlaq_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[ADD]]
@@ -5411,9 +4100,7 @@ uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
// CHECK-LABEL: @test_vmla_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[ADD]]
@@ -5424,9 +4111,7 @@ uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
// CHECK-LABEL: @test_vmlaq_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
@@ -5437,14 +4122,9 @@ uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
// CHECK-LABEL: @test_vqdmlal_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
@@ -5453,14 +4133,9 @@ int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqdmlal_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
@@ -5469,15 +4144,10 @@ int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vqdmlal_high_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
@@ -5486,15 +4156,10 @@ int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqdmlal_high_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
@@ -5503,9 +4168,7 @@ int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vmls_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <4 x i32>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
@@ -5516,9 +4179,7 @@ uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
// CHECK-LABEL: @test_vmlsq_lane_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> poison, <8 x i32>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
@@ -5529,9 +4190,7 @@ uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
// CHECK-LABEL: @test_vmls_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <2 x i32>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
@@ -5542,9 +4201,7 @@ uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
// CHECK-LABEL: @test_vmlsq_lane_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -5555,9 +4212,7 @@ uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
// CHECK-LABEL: @test_vmls_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i16> [[SUB]]
@@ -5568,9 +4223,7 @@ uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
// CHECK-LABEL: @test_vmlsq_laneq_u16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32>
// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <8 x i16> [[SUB]]
@@ -5581,9 +4234,7 @@ uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
// CHECK-LABEL: @test_vmls_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <2 x i32> [[SUB]]
@@ -5594,9 +4245,7 @@ uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
// CHECK-LABEL: @test_vmlsq_laneq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32>
// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
@@ -5607,14 +4256,9 @@ uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
// CHECK-LABEL: @test_vqdmlsl_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
@@ -5623,14 +4267,9 @@ int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqdmlsl_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
@@ -5639,15 +4278,10 @@ int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vqdmlsl_high_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
@@ -5656,15 +4290,10 @@ int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqdmlsl_high_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32>
// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A:%.*]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
@@ -5673,11 +4302,7 @@ int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-LABEL: @test_vqdmulh_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQDMULH_LANEQ_V]], <8 x i16> [[VQDMULH_LANEQ_V1]], i32 7)
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v4i16.v8i16(<4 x i16> [[A:%.*]], <8 x i16> [[V:%.*]], i32 7)
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_LANEQ_V2]]
//
int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
@@ -5686,11 +4311,7 @@ int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmulhq_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQDMULHQ_LANEQ_V]], <8 x i16> [[VQDMULHQ_LANEQ_V1]], i32 7)
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.laneq.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[V:%.*]], i32 7)
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
@@ -5699,11 +4320,7 @@ int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqdmulh_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQDMULH_LANEQ_V]], <4 x i32> [[VQDMULH_LANEQ_V1]], i32 3)
+// CHECK-NEXT: [[VQDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v2i32.v4i32(<2 x i32> [[A:%.*]], <4 x i32> [[V:%.*]], i32 3)
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_LANEQ_V2]]
//
int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
@@ -5712,11 +4329,7 @@ int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqdmulhq_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQDMULHQ_LANEQ_V]], <4 x i32> [[VQDMULHQ_LANEQ_V1]], i32 3)
+// CHECK-NEXT: [[VQDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.laneq.v4i32.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[V:%.*]], i32 3)
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
@@ -5725,11 +4338,7 @@ int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqrdmulh_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[VQRDMULH_LANEQ_V]], <8 x i16> [[VQRDMULH_LANEQ_V1]], i32 7)
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v4i16.v8i16(<4 x i16> [[A:%.*]], <8 x i16> [[V:%.*]], i32 7)
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_LANEQ_V2]]
//
int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
@@ -5738,11 +4347,7 @@ int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmulhq_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[VQRDMULHQ_LANEQ_V]], <8 x i16> [[VQRDMULHQ_LANEQ_V1]], i32 7)
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.laneq.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[V:%.*]], i32 7)
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_LANEQ_V2]]
//
int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
@@ -5751,11 +4356,7 @@ int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmulh_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[VQRDMULH_LANEQ_V]], <4 x i32> [[VQRDMULH_LANEQ_V1]], i32 3)
+// CHECK-NEXT: [[VQRDMULH_LANEQ_V2:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v2i32.v4i32(<2 x i32> [[A:%.*]], <4 x i32> [[V:%.*]], i32 3)
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_LANEQ_V2]]
//
int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
@@ -5764,11 +4365,7 @@ int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
// CHECK-LABEL: @test_vqrdmulhq_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[VQRDMULHQ_LANEQ_V]], <4 x i32> [[VQRDMULHQ_LANEQ_V1]], i32 3)
+// CHECK-NEXT: [[VQRDMULHQ_LANEQ_V2:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.laneq.v4i32.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[V:%.*]], i32 3)
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_LANEQ_V2]]
//
int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
diff --git a/clang/test/CodeGen/AArch64/neon-extract.c b/clang/test/CodeGen/AArch64/neon-extract.c
index e5699f813131f..61312cfdc02b7 100644
--- a/clang/test/CodeGen/AArch64/neon-extract.c
+++ b/clang/test/CodeGen/AArch64/neon-extract.c
@@ -1,246 +1,244 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target

#include <arm_neon.h>

-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vext_s8(<8 x i8> noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: ret <8 x i8> [[VEXT]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vext_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[VEXT]]
+//
int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
  return vext_s8(a, b, 2);
}

-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vext_s16(<4 x i16> noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x i16> [[VEXT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vext_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[VEXT]]
+//
int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
  return vext_s16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vext_s32(<2 x i32> noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x i32> [[VEXT]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vext_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VEXT]]
+//
int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
  return vext_s32(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vext_s64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[VEXT]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vext_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
  return vext_s64(a, b, 0);
}

-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vextq_s8(<16 x i8> noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: ret <16 x i8> [[VEXT]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vextq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[VEXT]]
+//
int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
  return vextq_s8(a, b, 2);
}

-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vextq_s16(<8 x i16> noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
-// CHECK: ret <8 x i16> [[VEXT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vextq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[VEXT]]
+//
int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
  return vextq_s16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vextq_s32(<4 x i32> noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x i32> [[VEXT]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vextq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VEXT]]
+//
int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
  return vextq_s32(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vextq_s64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x i64> [[VEXT]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vextq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[VEXT]]
+//
int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
  return vextq_s64(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vext_u8(<8 x i8> noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: ret <8 x i8> [[VEXT]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vext_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[VEXT]]
+//
uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
  return vext_u8(a, b, 2);
}

-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vext_u16(<4 x i16> noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x i16> [[VEXT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vext_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[VEXT]]
+//
uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
  return vext_u16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vext_u32(<2 x i32> noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x i32> [[VEXT]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vext_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VEXT]]
+//
uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
  return vext_u32(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vext_u64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[VEXT]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vext_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
  return vext_u64(a, b, 0);
}

-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vextq_u8(<16 x i8> noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: ret <16 x i8> [[VEXT]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vextq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[VEXT]]
+//
uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
  return vextq_u8(a, b, 2);
}

-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vextq_u16(<8 x i16> noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
-// CHECK: ret <8 x i16> [[VEXT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vextq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[VEXT]]
+//
uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
  return vextq_u16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vextq_u32(<4 x i32> noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x i32> [[VEXT]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vextq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VEXT]]
+//
uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
  return vextq_u32(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vextq_u64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x i64> [[VEXT]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vextq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[VEXT]]
+//
uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
  return vextq_u64(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} <2 x float> @test_vext_f32(<2 x float> noundef %a, <2 x float> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x float> [[VEXT]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vext_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x float> [[VEXT]]
+//
float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
  return vext_f32(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} <1 x double> @test_vext_f64(<1 x double> noundef %a, <1 x double> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[VEXT:%.*]] = shufflevector <1 x double> [[TMP2]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x double> [[VEXT]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vext_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x double> [[A]]
+//
float64x1_t test_vext_f64(float64x1_t a, float64x1_t b) {
  return vext_f64(a, b, 0);
}

-// CHECK-LABEL: define{{.*}} <4 x float> @test_vextq_f32(<4 x float> noundef %a, <4 x float> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x float> [[VEXT]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vextq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32>
+// CHECK-NEXT: ret <4 x float> [[VEXT]]
+//
float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
  return vextq_f32(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} <2 x double> @test_vextq_f64(<2 x double> noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x double> [[VEXT]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vextq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x double> [[VEXT]]
+//
float64x2_t test_vextq_f64(float64x2_t a, float64x2_t b) {
  return vextq_f64(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vext_p8(<8 x i8> noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: ret <8 x i8> [[VEXT]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vext_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[VEXT]]
+//
poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
  return vext_p8(a, b, 2);
}

-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vext_p16(<4 x i16> noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x i16> [[VEXT]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vext_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[VEXT]]
+//
poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
  return vext_p16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vextq_p8(<16 x i8> noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: ret <16 x i8> [[VEXT]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vextq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[VEXT]]
+//
poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
  return vextq_p8(a, b, 2);
}

-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vextq_p16(<8 x i16> noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
-// CHECK: ret <8 x i16> [[VEXT]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vextq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[VEXT]]
+//
poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
  return vextq_p16(a, b, 3);
}
diff --git a/clang/test/CodeGen/AArch64/neon-fma.c b/clang/test/CodeGen/AArch64/neon-fma.c
index b87c531b8b231..37c6aa8868305 100644
--- a/clang/test/CodeGen/AArch64/neon-fma.c
+++ b/clang/test/CodeGen/AArch64/neon-fma.c
@@ -1,5 +1,5 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
@@ -8,8 +8,8 @@
// CHECK-LABEL: define {{[^@]+}}@test_vmla_n_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[C]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[VECINIT1_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <2 x float> [[ADD_I]]
@@ -21,10 +21,8 @@ float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlaq_n_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[VECINIT3_I]]
// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <4 x float> [[ADD_I]]
@@ -36,10 +34,8 @@ float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlsq_n_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[C]], i32 3
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[VECINIT3_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <4 x float> [[SUB_I]]
@@ -51,8 +47,8 @@ float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vmls_n_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[C]], i32 1
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[VECINIT1_I]]
// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[A]], [[MUL_I]]
// CHECK-NEXT: ret <2 x float> [[SUB_I]]
@@ -64,9 +60,7 @@ float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vmla_lane_f32_0
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V]], <2 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[ADD]]
@@ -78,9 +72,7 @@ float32x2_t test_vmla_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlaq_lane_f32_0
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V]], <2 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[ADD]]
@@ -92,9 +84,7 @@ float32x4_t test_vmlaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmla_laneq_f32_0
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V]], <4 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[ADD]]
@@ -106,9 +96,7 @@ float32x2_t test_vmla_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlaq_laneq_f32_0
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V]], <4 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[ADD]]
@@ -120,9 +108,7 @@ float32x4_t test_vmlaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v)
// CHECK-LABEL: define {{[^@]+}}@test_vmls_lane_f32_0
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V]], <2 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[SUB]]
@@ -134,9 +120,7 @@ float32x2_t test_vmls_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlsq_lane_f32_0
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V]], <2 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[SUB]]
@@ -148,9 +132,7 @@ float32x4_t test_vmlsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmls_laneq_f32_0
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V]], <4 x float> poison, <2 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[SUB]]
@@ -162,9 +144,7 @@ float32x2_t test_vmls_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlsq_laneq_f32_0
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V]], <4 x float> poison, <4 x i32> zeroinitializer
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[SUB]]
@@ -176,9 +156,7 @@ float32x4_t test_vmlsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v)
// CHECK-LABEL: define {{[^@]+}}@test_vmla_lane_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V]], <2 x float> poison, <2 x i32>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[ADD]]
@@ -190,9 +168,7 @@ float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlaq_lane_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V]], <2 x float> poison, <4 x i32>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[ADD]]
@@ -204,9 +180,7 @@ float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmla_laneq_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V]], <4 x float> poison, <2 x i32>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[ADD]]
@@ -218,9 +192,7 @@ float32x2_t test_vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlaq_laneq_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V]], <4 x float> poison, <4 x i32>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[ADD]]
@@ -232,9 +204,7 @@ float32x4_t test_vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmls_lane_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V]], <2 x float> poison, <2 x i32>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[SUB]]
@@ -246,9 +216,7 @@ float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlsq_lane_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[V]], <2 x float> poison, <4 x i32>
// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <4 x float> [[SUB]]
@@ -259,9 +227,7 @@ float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmls_laneq_f32
// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V]], <4 x float> poison, <2 x i32>
// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
// CHECK-NEXT: [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]]
// CHECK-NEXT: ret <2 x float> [[SUB]]
@@ -273,9 +239,7 @@ float32x2_t test_vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
// CHECK-LABEL: define {{[^@]+}}@test_vmlsq_laneq_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef
[[V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[V]], <4 x float> poison, <4 x i32> // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]] // CHECK-NEXT: ret <4 x float> [[SUB]] @@ -287,13 +251,10 @@ float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { // CHECK-LABEL: define {{[^@]+}}@test_vfmaq_n_f64 // CHECK-SAME: (<2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]], double noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[C]], i32 1 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]]) -// CHECK-NEXT: ret <2 x double> [[TMP3]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[C]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x double> [[VECINIT_I]], <2 x double> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]]) +// CHECK-NEXT: ret <2 x double> [[TMP0]] // float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) { return vfmaq_n_f64(a, b, c); @@ -303,13 +264,10 @@ float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) { // CHECK-SAME: (<2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]], double noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x double> [[B]] -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[C]], i32 1 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG_I]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FNEG_I]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]]) -// CHECK-NEXT: ret <2 x double> [[TMP3]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[C]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x double> [[VECINIT_I]], <2 x double> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FNEG_I]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]]) +// CHECK-NEXT: ret <2 x double> [[TMP0]] // float64x2_t test_vfmsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) { return vfmsq_n_f64(a, b, c); diff --git a/clang/test/CodeGen/AArch64/neon-fp16fml.c b/clang/test/CodeGen/AArch64/neon-fp16fml.c index 976045d6e79f3..d7687ddca2888 100644 --- a/clang/test/CodeGen/AArch64/neon-fp16fml.c +++ 
b/clang/test/CodeGen/AArch64/neon-fp16fml.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +v8.2a -target-feature +neon -target-feature +fp16fml \ -// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s +// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine | FileCheck %s // REQUIRES: aarch64-registered-target @@ -12,10 +12,7 @@ // CHECK-LABEL: @test_vfmlal_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]]) // CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]] // float32x2_t test_vfmlal_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -24,10 +21,7 @@ float32x2_t test_vfmlal_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { // CHECK-LABEL: @test_vfmlsl_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8> -// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) +// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]]) // CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]] // float32x2_t test_vfmlsl_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -36,10 +30,7 @@ float32x2_t test_vfmlsl_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { // CHECK-LABEL: @test_vfmlal_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]]) // CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]] // float32x2_t test_vfmlal_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -48,10 +39,7 @@ float32x2_t test_vfmlal_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) { // CHECK-LABEL: @test_vfmlsl_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8> -// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) +// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]]) // 
CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]] // float32x2_t test_vfmlsl_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -60,10 +48,7 @@ float32x2_t test_vfmlsl_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) { // CHECK-LABEL: @test_vfmlalq_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]]) // CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]] // float32x4_t test_vfmlalq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -72,10 +57,7 @@ float32x4_t test_vfmlalq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) { // CHECK-LABEL: @test_vfmlslq_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8> -// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) +// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]]) // CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]] // float32x4_t test_vfmlslq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -84,10 +66,7 @@ float32x4_t test_vfmlslq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) { // CHECK-LABEL: @test_vfmlalq_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]]) // CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]] // float32x4_t test_vfmlalq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -96,10 +75,7 @@ float32x4_t test_vfmlalq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) { // CHECK-LABEL: @test_vfmlslq_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8> -// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) +// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[C:%.*]]) // CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]] // float32x4_t test_vfmlslq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -110,42 +86,8 @@ float32x4_t test_vfmlslq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) { // CHECK-LABEL: 
@test_vfmlal_lane_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) +// CHECK-NEXT: [[VECINIT31:%.*]] = shufflevector <4 x half> [[C:%.*]], <4 x half> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[VECINIT31]]) // CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]] // float32x2_t test_vfmlal_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -154,42 +96,8 @@ float32x2_t test_vfmlal_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c // CHECK-LABEL: @test_vfmlal_lane_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8474:%.*]] = 
alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) +// CHECK-NEXT: [[VECINIT31:%.*]] = shufflevector <4 x half> [[C:%.*]], <4 x half> poison, <4 x i32> +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[VECINIT31]]) // CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]] // float32x2_t test_vfmlal_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -198,74 +106,8 @@ float32x2_t test_vfmlal_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t // CHECK-LABEL: @test_vfmlalq_lane_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 -// 
CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84734:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84735:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84744:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84745:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84754:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84755:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84764:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84765:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84734]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load <4 x i16>, ptr [[__REINT_84734]], align 8 -// CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_84735]], align 2 -// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_84735]], align 2 -// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84744]], align 8 -// CHECK-NEXT: [[TMP10:%.*]] = load <4 x i16>, ptr [[__REINT_84744]], align 8 -// CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP10]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_84745]], align 2 -// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_84745]], align 2 -// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84754]], align 8 -// CHECK-NEXT: 
[[TMP12:%.*]] = load <4 x i16>, ptr [[__REINT_84754]], align 8 -// CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP12]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_84755]], align 2 -// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_84755]], align 2 -// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84764]], align 8 -// CHECK-NEXT: [[TMP14:%.*]] = load <4 x i16>, ptr [[__REINT_84764]], align 8 -// CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP14]], i32 2 -// CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_84765]], align 2 -// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_84765]], align 2 -// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) +// CHECK-NEXT: [[VECINIT71:%.*]] = shufflevector <4 x half> [[C:%.*]], <4 x half> poison, <8 x i32> +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[VECINIT71]]) // CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]] // float32x4_t test_vfmlalq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) { @@ -274,74 +116,8 @@ float32x4_t test_vfmlalq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t // CHECK-LABEL: @test_vfmlalq_lane_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84734:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84735:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84744:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84745:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84754:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84755:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84764:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84765:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 -// CHECK-NEXT: store i16 
[[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84734]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load <4 x i16>, ptr [[__REINT_84734]], align 8 -// CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_84735]], align 2 -// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_84735]], align 2 -// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84744]], align 8 -// CHECK-NEXT: [[TMP10:%.*]] = load <4 x i16>, ptr [[__REINT_84744]], align 8 -// CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP10]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_84745]], align 2 -// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_84745]], align 2 -// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84754]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i16>, ptr [[__REINT_84754]], align 8 -// CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP12]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_84755]], align 2 -// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_84755]], align 2 -// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84764]], align 8 -// CHECK-NEXT: [[TMP14:%.*]] = load <4 x i16>, ptr [[__REINT_84764]], align 8 -// CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP14]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_84765]], align 2 -// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_84765]], align 2 -// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) +// CHECK-NEXT: [[VECINIT71:%.*]] = shufflevector <4 x half> [[C:%.*]], <4 x half> poison, <8 x 
i32> +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[VECINIT71]]) // CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]] // float32x4_t test_vfmlalq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) { @@ -350,42 +126,8 @@ float32x4_t test_vfmlalq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t // CHECK-LABEL: @test_vfmlal_laneq_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) +// CHECK-NEXT: [[VECINIT31:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <4 x i32> +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[VECINIT31]]) // CHECK-NEXT: ret <2 x float> 
[[VFMLAL_LOW3_I]] // float32x2_t test_vfmlal_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) { @@ -394,42 +136,8 @@ float32x2_t test_vfmlal_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t // CHECK-LABEL: @test_vfmlal_laneq_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) +// CHECK-NEXT: [[VECINIT31:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <4 x i32> +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[VECINIT31]]) // CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]] // float32x2_t test_vfmlal_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) { @@ -438,74 +146,8 @@ float32x2_t test_vfmlal_laneq_high_f16(float32x2_t a, float16x4_t b, 
float16x8_t // CHECK-LABEL: @test_vfmlalq_laneq_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85034:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85035:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85044:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85045:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85054:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85055:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85064:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85065:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85034]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr [[__REINT_85034]], align 16 -// CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85035]], align 2 -// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_85035]], align 2 -// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85044]], align 16 -// 
CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr [[__REINT_85044]], align 16 -// CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP10]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85045]], align 2 -// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85045]], align 2 -// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85054]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i16>, ptr [[__REINT_85054]], align 16 -// CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP12]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85055]], align 2 -// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_85055]], align 2 -// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85064]], align 16 -// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr [[__REINT_85064]], align 16 -// CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP14]], i32 6 -// CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85065]], align 2 -// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85065]], align 2 -// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) +// CHECK-NEXT: [[VECINIT71:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <8 x i32> +// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[VECINIT71]]) // CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]] // float32x4_t test_vfmlalq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -514,74 +156,8 @@ float32x4_t test_vfmlalq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t // CHECK-LABEL: @test_vfmlalq_laneq_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85034:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85035:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85044:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85045:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85054:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85055:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85064:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85065:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[VGETQ_LANE:%.*]] = 
extractelement <8 x i16> [[TMP0]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85034]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr [[__REINT_85034]], align 16 -// CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85035]], align 2 -// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_85035]], align 2 -// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85044]], align 16 -// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr [[__REINT_85044]], align 16 -// CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP10]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85045]], align 2 -// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85045]], align 2 -// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85054]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i16>, ptr [[__REINT_85054]], align 16 -// CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP12]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85055]], align 2 -// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_85055]], align 2 -// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85064]], align 16 -// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr [[__REINT_85064]], align 16 -// CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP14]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85065]], align 2 -// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85065]], align 2 -// CHECK-NEXT: 
[[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7 -// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8> -// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) +// CHECK-NEXT: [[VECINIT71:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <8 x i32> +// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[VECINIT71]]) // CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]] // float32x4_t test_vfmlalq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) { @@ -590,42 +166,8 @@ float32x4_t test_vfmlalq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_ // CHECK-LABEL: @test_vfmlsl_lane_low_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 0 -// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] 
to <8 x i8> -// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) +// CHECK-NEXT: [[VECINIT31:%.*]] = shufflevector <4 x half> [[C:%.*]], <4 x half> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[VECINIT31]]) // CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]] // float32x2_t test_vfmlsl_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) { @@ -634,42 +176,8 @@ float32x2_t test_vfmlsl_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c // CHECK-LABEL: @test_vfmlsl_lane_high_f16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 1 -// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8> -// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x 
half> [[B]], <4 x half> [[VECINIT31]])
+// CHECK-NEXT: [[VECINIT31:%.*]] = shufflevector <4 x half> [[C:%.*]], <4 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
//
float32x2_t test_vfmlsl_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -678,74 +186,8 @@ float32x2_t test_vfmlsl_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t
// CHECK-LABEL: @test_vfmlslq_lane_low_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84734:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84735:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84744:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84745:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84754:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84755:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84764:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84765:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8
-// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8
-// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8
-// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84734]], align 8
-// CHECK-NEXT: [[TMP8:%.*]] = load <4 x i16>, ptr [[__REINT_84734]], align 8
-// CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP8]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_84735]], align 2
-// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_84735]], align 2
-// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84744]], align 8
-// CHECK-NEXT: [[TMP10:%.*]] = load <4 x i16>, ptr [[__REINT_84744]], align 8
-// CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP10]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_84745]], align 2
-// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_84745]], align 2
-// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84754]], align 8
-// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i16>, ptr [[__REINT_84754]], align 8
-// CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP12]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_84755]], align 2
-// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_84755]], align 2
-// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84764]], align 8
-// CHECK-NEXT: [[TMP14:%.*]] = load <4 x i16>, ptr [[__REINT_84764]], align 8
-// CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP14]], i32 2
-// CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_84765]], align 2
-// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_84765]], align 2
-// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7
-// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
+// CHECK-NEXT: [[VECINIT71:%.*]] = shufflevector <4 x half> [[C:%.*]], <4 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
//
float32x4_t test_vfmlslq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@@ -754,74 +196,8 @@ float32x4_t test_vfmlslq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t
// CHECK-LABEL: @test_vfmlslq_lane_high_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8474:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_8475:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84714:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84715:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84724:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84725:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84734:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: [[__REINT1_84735:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_84744:%.*]] = alloca <4 x half>, align 8
-// CHECK-NEXT: 
[[__REINT1_84745:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84754:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84755:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_84764:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_84765:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <4 x half> [[C:%.*]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__REINT_8474]], align 8 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE8]], ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8475]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT_84714]], align 8 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE18]], ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_84715]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[__REINT_84724]], align 8 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x i16> [[TMP6]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE28]], ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_84725]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84734]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load <4 x i16>, ptr [[__REINT_84734]], align 8 -// CHECK-NEXT: [[VGET_LANE38:%.*]] = extractelement <4 x i16> [[TMP8]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE38]], ptr [[__REINT1_84735]], align 2 -// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_84735]], align 2 -// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84744]], align 8 -// CHECK-NEXT: [[TMP10:%.*]] = load <4 x i16>, ptr [[__REINT_84744]], align 8 -// CHECK-NEXT: [[VGET_LANE48:%.*]] = extractelement <4 x i16> [[TMP10]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE48]], ptr [[__REINT1_84745]], align 2 -// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_84745]], align 2 -// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5 -// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84754]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i16>, ptr [[__REINT_84754]], align 8 -// CHECK-NEXT: [[VGET_LANE58:%.*]] = extractelement <4 x i16> [[TMP12]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE58]], ptr [[__REINT1_84755]], align 2 -// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_84755]], align 2 -// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement 
<8 x half> [[VECINIT51]], half [[TMP13]], i32 6
-// CHECK-NEXT: store <4 x half> [[C]], ptr [[__REINT_84764]], align 8
-// CHECK-NEXT: [[TMP14:%.*]] = load <4 x i16>, ptr [[__REINT_84764]], align 8
-// CHECK-NEXT: [[VGET_LANE68:%.*]] = extractelement <4 x i16> [[TMP14]], i32 3
-// CHECK-NEXT: store i16 [[VGET_LANE68]], ptr [[__REINT1_84765]], align 2
-// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_84765]], align 2
-// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7
-// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
+// CHECK-NEXT: [[VECINIT71:%.*]] = shufflevector <4 x half> [[C:%.*]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
//
float32x4_t test_vfmlslq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@@ -830,42 +206,8 @@ float32x4_t test_vfmlslq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t
// CHECK-LABEL: @test_vfmlsl_laneq_low_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 4
-// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4
-// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4
-// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4
-// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
+// CHECK-NEXT: [[VECINIT31:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]]
//
float32x2_t test_vfmlsl_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@@ -874,42 +216,8 @@ float32x2_t test_vfmlsl_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t
// CHECK-LABEL: @test_vfmlsl_laneq_high_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 5
-// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5
-// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5
-// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <4 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5
-// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <4 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
+// CHECK-NEXT: [[VECINIT31:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
//
float32x2_t test_vfmlsl_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@@ -918,74 +226,8 @@ float32x2_t test_vfmlsl_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t
// CHECK-LABEL: @test_vfmlslq_laneq_low_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_8505:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85034:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85035:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85044:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85045:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85054:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85055:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_85064:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_85065:%.*]] = alloca i16, align 2
-// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16
-// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2
-// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16
-// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2
-// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16
-// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2
-// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85034]], align 16
-// CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr [[__REINT_85034]], align 16
-// CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP8]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85035]], align 2
-// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_85035]], align 2
-// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85044]], align 16
-// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr [[__REINT_85044]], align 16
-// CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP10]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85045]], align 2
-// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr [[__REINT1_85045]], align 2
-// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85054]], align 16
-// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i16>, ptr [[__REINT_85054]], align 16
-// CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP12]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85055]], align 2
-// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_85055]], align 2
-// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85064]], align 16
-// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr [[__REINT_85064]], align 16
-// CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP14]], i32 6
-// CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85065]], align 2
-// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85065]], align 2
-// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7
-// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
+// CHECK-NEXT: [[VECINIT71:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
+// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
//
float32x4_t test_vfmlslq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -994,74 +236,8 @@ float32x4_t test_vfmlslq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t
// CHECK-LABEL: @test_vfmlslq_laneq_high_f16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2
-// CHECK-NEXT: [[__REINT_8504:%.*]] = alloca <8 x half>, align 16
-// CHECK-NEXT: 
[[__REINT1_8505:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85014:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85015:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85024:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85025:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85034:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85035:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85044:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85045:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85054:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85055:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[__REINT_85064:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_85065:%.*]] = alloca i16, align 2 -// CHECK-NEXT: store <8 x half> [[C:%.*]], ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP1]], i32 0 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__REINT_8504]], align 16 -// CHECK-NEXT: [[VGETQ_LANE8:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE8]], ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[TMP3:%.*]] = load half, ptr [[__REINT1_8505]], align 2 -// CHECK-NEXT: [[VECINIT11:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP3]], i32 1 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT_85014]], align 16 -// CHECK-NEXT: [[VGETQ_LANE18:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE18]], ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[TMP5:%.*]] = load half, ptr [[__REINT1_85015]], align 2 -// CHECK-NEXT: [[VECINIT21:%.*]] = insertelement <8 x half> [[VECINIT11]], half [[TMP5]], i32 2 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[__REINT_85024]], align 16 -// CHECK-NEXT: [[VGETQ_LANE28:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE28]], ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[TMP7:%.*]] = load half, ptr [[__REINT1_85025]], align 2 -// CHECK-NEXT: [[VECINIT31:%.*]] = insertelement <8 x half> [[VECINIT21]], half [[TMP7]], i32 3 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85034]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr [[__REINT_85034]], align 16 -// CHECK-NEXT: [[VGETQ_LANE38:%.*]] = extractelement <8 x i16> [[TMP8]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE38]], ptr [[__REINT1_85035]], align 2 -// CHECK-NEXT: [[TMP9:%.*]] = load half, ptr [[__REINT1_85035]], align 2 -// CHECK-NEXT: [[VECINIT41:%.*]] = insertelement <8 x half> [[VECINIT31]], half [[TMP9]], i32 4 -// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85044]], align 16 -// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr [[__REINT_85044]], align 16 -// CHECK-NEXT: [[VGETQ_LANE48:%.*]] = extractelement <8 x i16> [[TMP10]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE48]], ptr [[__REINT1_85045]], align 2 -// CHECK-NEXT: [[TMP11:%.*]] = load half, ptr 
[[__REINT1_85045]], align 2
-// CHECK-NEXT: [[VECINIT51:%.*]] = insertelement <8 x half> [[VECINIT41]], half [[TMP11]], i32 5
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85054]], align 16
-// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i16>, ptr [[__REINT_85054]], align 16
-// CHECK-NEXT: [[VGETQ_LANE58:%.*]] = extractelement <8 x i16> [[TMP12]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE58]], ptr [[__REINT1_85055]], align 2
-// CHECK-NEXT: [[TMP13:%.*]] = load half, ptr [[__REINT1_85055]], align 2
-// CHECK-NEXT: [[VECINIT61:%.*]] = insertelement <8 x half> [[VECINIT51]], half [[TMP13]], i32 6
-// CHECK-NEXT: store <8 x half> [[C]], ptr [[__REINT_85064]], align 16
-// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr [[__REINT_85064]], align 16
-// CHECK-NEXT: [[VGETQ_LANE68:%.*]] = extractelement <8 x i16> [[TMP14]], i32 7
-// CHECK-NEXT: store i16 [[VGETQ_LANE68]], ptr [[__REINT1_85065]], align 2
-// CHECK-NEXT: [[TMP15:%.*]] = load half, ptr [[__REINT1_85065]], align 2
-// CHECK-NEXT: [[VECINIT71:%.*]] = insertelement <8 x half> [[VECINIT61]], half [[TMP15]], i32 7
-// CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
+// CHECK-NEXT: [[VECINIT71:%.*]] = shufflevector <8 x half> [[C:%.*]], <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A:%.*]], <8 x half> [[B:%.*]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
//
float32x4_t test_vfmlslq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c b/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c
index 15ae7eea820e8..d8300a0df6d32 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics-constrained.c
@@ -1,12 +1,13 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -disable-O0-optnone \
-// RUN: -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefixes=COMMON,COMMONIR,UNCONSTRAINED %s
+// RUN: -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine \
+// RUN: | FileCheck --check-prefixes=UNCONSTRAINED %s
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -disable-O0-optnone \
// RUN: -ffp-exception-behavior=strict \
-// RUN: -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefixes=COMMON,COMMONIR,CONSTRAINED %s
+// RUN: -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine \
+// RUN: | FileCheck --check-prefixes=CONSTRAINED %s
// REQUIRES: aarch64-registered-target
@@ -14,804 +15,1523 @@
#include <arm_neon.h>
-// COMMON-LABEL: test_vadd_f32
-// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <2 x float> %v1, %v2
-// CONSTRAINED: [[ADD_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x float> [[ADD_I]]
+// 
UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vadd_f32( +// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0:[0-9]+]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <2 x float> [[ADD_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vadd_f32( +// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0:[0-9]+]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3:[0-9]+]] +// CONSTRAINED-NEXT: ret <2 x float> [[ADD_I]] +// float32x2_t test_vadd_f32(float32x2_t v1, float32x2_t v2) { return vadd_f32(v1, v2); } -// COMMON-LABEL: test_vaddq_f32 -// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <4 x float> %v1, %v2 -// CONSTRAINED: [[ADD_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <4 x float> [[ADD_I]] +// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vaddq_f32( +// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <4 x float> [[ADD_I]] +// +// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vaddq_f32( +// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <4 x float> [[ADD_I]] +// float32x4_t test_vaddq_f32(float32x4_t v1, float32x4_t v2) { return vaddq_f32(v1, v2); } -// COMMON-LABEL: test_vsub_f32 -// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <2 x float> %v1, %v2 -// CONSTRAINED: [[SUB_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x float> [[SUB_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vsub_f32( +// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <2 x float> [[SUB_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vsub_f32( +// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x float> [[SUB_I]] +// float32x2_t test_vsub_f32(float32x2_t v1, float32x2_t v2) { return vsub_f32(v1, v2); } -// COMMON-LABEL: test_vsubq_f32 -// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <4 x float> %v1, %v2 -// CONSTRAINED: [[SUB_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"round.tonearest", 
metadata !"fpexcept.strict") -// COMMONIR: ret <4 x float> [[SUB_I]] +// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vsubq_f32( +// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <4 x float> [[SUB_I]] +// +// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vsubq_f32( +// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <4 x float> [[SUB_I]] +// float32x4_t test_vsubq_f32(float32x4_t v1, float32x4_t v2) { return vsubq_f32(v1, v2); } -// COMMON-LABEL: test_vsubq_f64 -// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <2 x double> %v1, %v2 -// CONSTRAINED: [[SUB_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x double> [[SUB_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vsubq_f64( +// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <2 x double> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <2 x double> [[SUB_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vsubq_f64( +// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x double> [[SUB_I]] +// float64x2_t test_vsubq_f64(float64x2_t v1, float64x2_t v2) { return vsubq_f64(v1, v2); } -// COMMON-LABEL: test_vmul_f32 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x float> %v1, %v2 -// CONSTRAINED: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x float> [[MUL_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vmul_f32( +// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <2 x float> [[MUL_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vmul_f32( +// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x float> [[MUL_I]] +// float32x2_t test_vmul_f32(float32x2_t v1, float32x2_t v2) { return vmul_f32(v1, v2); } -// COMMON-LABEL: test_vmulq_f32 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <4 x float> %v1, %v2 -// CONSTRAINED: [[MUL_I:%.*]] = call <4 x float> 
@llvm.experimental.constrained.fmul.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <4 x float> [[MUL_I]] +// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vmulq_f32( +// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <4 x float> [[MUL_I]] +// +// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vmulq_f32( +// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <4 x float> [[MUL_I]] +// float32x4_t test_vmulq_f32(float32x4_t v1, float32x4_t v2) { return vmulq_f32(v1, v2); } -// COMMON-LABEL: test_vmulq_f64 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x double> %v1, %v2 -// CONSTRAINED: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x double> [[MUL_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vmulq_f64( +// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V1]], [[V2]] +// UNCONSTRAINED-NEXT: ret <2 x double> [[MUL_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vmulq_f64( +// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x double> [[MUL_I]] +// float64x2_t test_vmulq_f64(float64x2_t v1, float64x2_t v2) { return vmulq_f64(v1, v2); } -// COMMON-LABEL: test_vmla_f32 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3 -// CONSTRAINED: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %v2, <2 x float> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict") -// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <2 x float> %v1, [[MUL_I]] -// CONSTRAINED: [[ADD_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %v1, <2 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x float> [[ADD_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vmla_f32( +// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V2]], [[V3]] +// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[V1]], [[MUL_I]] +// UNCONSTRAINED-NEXT: ret <2 x float> [[ADD_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vmla_f32( +// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// 
CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> [[V2]], <2 x float> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> [[V1]], <2 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x float> [[ADD_I]] +// float32x2_t test_vmla_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { return vmla_f32(v1, v2, v3); } -// COMMON-LABEL: test_vmlaq_f32 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3 -// CONSTRAINED: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> %v2, <4 x float> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict") -// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <4 x float> %v1, [[MUL_I]] -// CONSTRAINED: [[ADD_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> %v1, <4 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <4 x float> [[ADD_I]] +// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vmlaq_f32( +// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V2]], [[V3]] +// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[V1]], [[MUL_I]] +// UNCONSTRAINED-NEXT: ret <4 x float> [[ADD_I]] +// +// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vmlaq_f32( +// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> [[V2]], <4 x float> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fadd.v4f32(<4 x float> [[V1]], <4 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <4 x float> [[ADD_I]] +// float32x4_t test_vmlaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { return vmlaq_f32(v1, v2, v3); } -// COMMON-LABEL: test_vmlaq_f64 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3 -// CONSTRAINED: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %v2, <2 x double> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict") -// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <2 x double> %v1, [[MUL_I]] -// CONSTRAINED: [[ADD_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> %v1, <2 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x double> [[ADD_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vmlaq_f64( +// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V2]], [[V3]] +// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <2 x double> [[V1]], [[MUL_I]] +// UNCONSTRAINED-NEXT: ret <2 x double> [[ADD_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vmlaq_f64( +// CONSTRAINED-SAME: <2 x double> noundef 
[[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> [[V2]], <2 x double> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double> [[V1]], <2 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x double> [[ADD_I]] +// float64x2_t test_vmlaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { return vmlaq_f64(v1, v2, v3); } -// COMMON-LABEL: test_vmls_f32 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3 -// CONSTRAINED: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %v2, <2 x float> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict") -// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <2 x float> %v1, [[MUL_I]] -// CONSTRAINED: [[SUB_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %v1, <2 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x float> [[SUB_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vmls_f32( +// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V2]], [[V3]] +// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[V1]], [[MUL_I]] +// UNCONSTRAINED-NEXT: ret <2 x float> [[SUB_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vmls_f32( +// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> [[V2]], <2 x float> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> [[V1]], <2 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x float> [[SUB_I]] +// float32x2_t test_vmls_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) { return vmls_f32(v1, v2, v3); } -// COMMON-LABEL: test_vmlsq_f32 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3 -// CONSTRAINED: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> %v2, <4 x float> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict") -// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <4 x float> %v1, [[MUL_I]] -// CONSTRAINED: [[SUB_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float> %v1, <4 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <4 x float> [[SUB_I]] +// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vmlsq_f32( +// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V2]], [[V3]] +// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[V1]], [[MUL_I]] +// UNCONSTRAINED-NEXT: ret <4 x float> [[SUB_I]] +// 
+// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vmlsq_f32( +// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fmul.v4f32(<4 x float> [[V2]], <4 x float> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fsub.v4f32(<4 x float> [[V1]], <4 x float> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <4 x float> [[SUB_I]] +// float32x4_t test_vmlsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) { return vmlsq_f32(v1, v2, v3); } -// COMMON-LABEL: test_vmlsq_f64 -// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3 -// CONSTRAINED: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> %v2, <2 x double> %v3, metadata !"round.tonearest", metadata !"fpexcept.strict") -// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <2 x double> %v1, [[MUL_I]] -// CONSTRAINED: [[SUB_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> %v1, <2 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x double> [[SUB_I]] +// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vmlsq_f64( +// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V2]], [[V3]] +// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <2 x double> [[V1]], [[MUL_I]] +// UNCONSTRAINED-NEXT: ret <2 x double> [[SUB_I]] +// +// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vmlsq_f64( +// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fmul.v2f64(<2 x double> [[V2]], <2 x double> [[V3]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fsub.v2f64(<2 x double> [[V1]], <2 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: ret <2 x double> [[SUB_I]] +// float64x2_t test_vmlsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) { return vmlsq_f64(v1, v2, v3); } -// COMMON-LABEL: test_vfma_f32 -// COMMONIR: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8> -// UNCONSTRAINED: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %v3, <2 x float> %v1) -// CONSTRAINED: [[TMP3:%.*]] = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %v2, <2 x float> %v3, <2 x float> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict") -// COMMONIR: ret <2 x float> [[TMP3]] +// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vfma_f32( +// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <2 x float> 
@llvm.fma.v2f32(<2 x float> [[V2]], <2 x float> [[V3]], <2 x float> [[V1]])
+// UNCONSTRAINED-NEXT: ret <2 x float> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vfma_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> [[V2]], <2 x float> [[V3]], <2 x float> [[V1]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
   return vfma_f32(v1, v2, v3);
 }
-// COMMON-LABEL: test_vfmaq_f32
-// COMMONIR: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %v2, <4 x float> %v3, <4 x float> %v1)
-// CONSTRAINED: [[TMP3:%.*]] = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %v2, <4 x float> %v3, <4 x float> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <4 x float> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vfmaq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[V2]], <4 x float> [[V3]], <4 x float> [[V1]])
+// UNCONSTRAINED-NEXT: ret <4 x float> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vfmaq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> [[V2]], <4 x float> [[V3]], <4 x float> [[V1]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
   return vfmaq_f32(v1, v2, v3);
 }
-// COMMON-LABEL: test_vfmaq_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %v3, <2 x double> %v1)
-// CONSTRAINED: [[TMP3:%.*]] = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %v2, <2 x double> %v3, <2 x double> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x double> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vfmaq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[V2]], <2 x double> [[V3]], <2 x double> [[V1]])
+// UNCONSTRAINED-NEXT: ret <2 x double> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vfmaq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> [[V2]], <2 x double> [[V3]], <2 x double> [[V1]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
   return vfmaq_f64(v1, v2, v3);
 }
-// COMMON-LABEL: test_vfms_f32
-// COMMONIR: [[SUB_I:%.*]] = fneg <2 x float> %v2
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %v3, <2 x float> %v1)
-// CONSTRAINED: [[TMP3:%.*]] = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %v3, <2 x float> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x float> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vfms_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[V2]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[V3]], <2 x float> [[V1]])
+// UNCONSTRAINED-NEXT: ret <2 x float> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vfms_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[V2]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[V3]], <2 x float> [[V1]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vfms_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
   return vfms_f32(v1, v2, v3);
 }
-// COMMON-LABEL: test_vfmsq_f32
-// COMMONIR: [[SUB_I:%.*]] = fneg <4 x float> %v2
-// COMMONIR: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %v3, <4 x float> %v1)
-// CONSTRAINED: [[TMP3:%.*]] = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %v3, <4 x float> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <4 x float> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vfmsq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[V2]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[V3]], <4 x float> [[V1]])
+// UNCONSTRAINED-NEXT: ret <4 x float> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vfmsq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[V2]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[V3]], <4 x float> [[V1]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vfmsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
   return vfmsq_f32(v1, v2, v3);
 }
-// COMMON-LABEL: test_vfmsq_f64
-// COMMONIR: [[SUB_I:%.*]] = fneg <2 x double> %v2
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <2 x double> [[SUB_I]] to <16 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[SUB_I]], <2 x double> %v3, <2 x double> %v1)
-// CONSTRAINED: [[TMP3:%.*]] = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> [[SUB_I]], <2 x double> %v3, <2 x double> %v1, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x double> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vfmsq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <2 x double> [[V2]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FNEG_I]], <2 x double> [[V3]], <2 x double> [[V1]])
+// UNCONSTRAINED-NEXT: ret <2 x double> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vfmsq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <2 x double> [[V2]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> [[FNEG_I]], <2 x double> [[V3]], <2 x double> [[V1]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vfmsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
   return vfmsq_f64(v1, v2, v3);
 }
-// COMMON-LABEL: test_vdivq_f64
-// UNCONSTRAINED: [[DIV_I:%.*]] = fdiv <2 x double> %v1, %v2
-// CONSTRAINED: [[DIV_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x double> [[DIV_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vdivq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[DIV_I:%.*]] = fdiv <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: ret <2 x double> [[DIV_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vdivq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[DIV_I:%.*]] = call <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x double> [[DIV_I]]
+//
 float64x2_t test_vdivq_f64(float64x2_t v1, float64x2_t v2) {
   return vdivq_f64(v1, v2);
 }
-// COMMON-LABEL: test_vdivq_f32
-// UNCONSTRAINED: [[DIV_I:%.*]] = fdiv <4 x float> %v1, %v2
-// CONSTRAINED: [[DIV_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fdiv.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <4 x float> [[DIV_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vdivq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[DIV_I:%.*]] = fdiv <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: ret <4 x float> [[DIV_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vdivq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[DIV_I:%.*]] = call <4 x float> @llvm.experimental.constrained.fdiv.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <4 x float> [[DIV_I]]
+//
 float32x4_t test_vdivq_f32(float32x4_t v1, float32x4_t v2) {
   return vdivq_f32(v1, v2);
 }
-// COMMON-LABEL: test_vdiv_f32
-// UNCONSTRAINED: [[DIV_I:%.*]] = fdiv <2 x float> %v1, %v2
-// CONSTRAINED: [[DIV_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fdiv.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <2 x float> [[DIV_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x float> @test_vdiv_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[DIV_I:%.*]] = fdiv <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: ret <2 x float> [[DIV_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x float> @test_vdiv_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[DIV_I:%.*]] = call <2 x float> @llvm.experimental.constrained.fdiv.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <2 x float> [[DIV_I]]
+//
 float32x2_t test_vdiv_f32(float32x2_t v1, float32x2_t v2) {
   return vdiv_f32(v1, v2);
 }
-// COMMON-LABEL: test_vceq_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oeq <2 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// COMMONIR: ret <2 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i32> @test_vceq_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oeq <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// UNCONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i32> @test_vceq_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
 uint32x2_t test_vceq_f32(float32x2_t v1, float32x2_t v2) {
   return vceq_f32(v1, v2);
 }
-// COMMON-LABEL: test_vceq_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oeq <1 x double> %a, %b
-// CONSTRAINED: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %a, <1 x double> %b, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// COMMONIR: ret <1 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vceq_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oeq <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vceq_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
 uint64x1_t test_vceq_f64(float64x1_t a, float64x1_t b) {
   return vceq_f64(a, b);
 }
-// COMMON-LABEL: test_vceqq_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oeq <4 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// COMMONIR: ret <4 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x i32> @test_vceqq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oeq <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// UNCONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x i32> @test_vceqq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vceqq_f32(float32x4_t v1, float32x4_t v2) {
   return vceqq_f32(v1, v2);
 }
-// COMMON-LABEL: test_vceqq_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oeq <2 x double> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// COMMONIR: ret <2 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i64> @test_vceqq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oeq <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// UNCONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i64> @test_vceqq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
 uint64x2_t test_vceqq_f64(float64x2_t v1, float64x2_t v2) {
   return vceqq_f64(v1, v2);
 }
-// COMMON-LABEL: test_vcge_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oge <2 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// COMMONIR: ret <2 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcge_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oge <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// UNCONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcge_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
 uint32x2_t test_vcge_f32(float32x2_t v1, float32x2_t v2) {
   return vcge_f32(v1, v2);
 }
-// COMMON-LABEL: test_vcge_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oge <1 x double> %a, %b
-// CONSTRAINED: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// COMMONIR: ret <1 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcge_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oge <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcge_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
 uint64x1_t test_vcge_f64(float64x1_t a, float64x1_t b) {
   return vcge_f64(a, b);
 }
-// COMMON-LABEL: test_vcgeq_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oge <4 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// COMMONIR: ret <4 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcgeq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oge <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// UNCONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcgeq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcgeq_f32(float32x4_t v1, float32x4_t v2) {
   return vcgeq_f32(v1, v2);
 }
-// COMMON-LABEL: test_vcgeq_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp oge <2 x double> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"oge", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// COMMONIR: ret <2 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcgeq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp oge <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// UNCONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcgeq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
 uint64x2_t test_vcgeq_f64(float64x2_t v1, float64x2_t v2) {
   return vcgeq_f64(v1, v2);
 }
-// COMMON-LABEL: test_vcle_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ole <2 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// COMMONIR: ret <2 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcle_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ole <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// UNCONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcle_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
 uint32x2_t test_vcle_f32(float32x2_t v1, float32x2_t v2) {
   return vcle_f32(v1, v2);
 }
-// COMMON-LABEL: test_vcle_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ole <1 x double> %a, %b
-// CONSTRAINED: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// COMMONIR: ret <1 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcle_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ole <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcle_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
 uint64x1_t test_vcle_f64(float64x1_t a, float64x1_t b) {
   return vcle_f64(a, b);
 }
-// COMMON-LABEL: test_vcleq_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ole <4 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// COMMONIR: ret <4 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcleq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ole <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// UNCONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcleq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcleq_f32(float32x4_t v1, float32x4_t v2) {
   return vcleq_f32(v1, v2);
 }
-// COMMON-LABEL: test_vcleq_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ole <2 x double> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"ole", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// COMMONIR: ret <2 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcleq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ole <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// UNCONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcleq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
 uint64x2_t test_vcleq_f64(float64x2_t v1, float64x2_t v2) {
   return vcleq_f64(v1, v2);
 }
-// COMMON-LABEL: test_vcgt_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ogt <2 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// COMMONIR: ret <2 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcgt_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ogt <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// UNCONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i32> @test_vcgt_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
 uint32x2_t test_vcgt_f32(float32x2_t v1, float32x2_t v2) {
   return vcgt_f32(v1, v2);
 }
-// COMMON-LABEL: test_vcgt_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ogt <1 x double> %a, %b
-// CONSTRAINED: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// COMMONIR: ret <1 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcgt_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ogt <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcgt_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
 uint64x1_t test_vcgt_f64(float64x1_t a, float64x1_t b) {
   return vcgt_f64(a, b);
 }
-// COMMON-LABEL: test_vcgtq_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ogt <4 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// COMMONIR: ret <4 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcgtq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ogt <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// UNCONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcgtq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcgtq_f32(float32x4_t v1, float32x4_t v2) {
   return vcgtq_f32(v1, v2);
 }
-// COMMON-LABEL: test_vcgtq_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp ogt <2 x double> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"ogt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// COMMONIR: ret <2 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcgtq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp ogt <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// UNCONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcgtq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
 uint64x2_t test_vcgtq_f64(float64x2_t v1, float64x2_t v2) {
   return vcgtq_f64(v1, v2);
 }
-// COMMON-LABEL: test_vclt_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp olt <2 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> %v1, <2 x float> %v2, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// COMMONIR: ret <2 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i32> @test_vclt_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// UNCONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i32> @test_vclt_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> [[V1]], <2 x float> [[V2]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CONSTRAINED-NEXT: ret <2 x i32> [[SEXT_I]]
+//
 uint32x2_t test_vclt_f32(float32x2_t v1, float32x2_t v2) {
   return vclt_f32(v1, v2);
 }
-// COMMON-LABEL: test_vclt_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp olt <1 x double> %a, %b
-// CONSTRAINED: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %a, <1 x double> %b, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// COMMONIR: ret <1 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vclt_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp olt <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vclt_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CONSTRAINED-NEXT: ret <1 x i64> [[SEXT_I]]
+//
 uint64x1_t test_vclt_f64(float64x1_t a, float64x1_t b) {
   return vclt_f64(a, b);
 }
-// COMMON-LABEL: test_vcltq_f32
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp olt <4 x float> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> %v1, <4 x float> %v2, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// COMMONIR: ret <4 x i32> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcltq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// UNCONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x i32> @test_vcltq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> [[V1]], <4 x float> [[V2]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CONSTRAINED-NEXT: ret <4 x i32> [[SEXT_I]]
+//
 uint32x4_t test_vcltq_f32(float32x4_t v1, float32x4_t v2) {
   return vcltq_f32(v1, v2);
 }
-// COMMON-LABEL: test_vcltq_f64
-// UNCONSTRAINED: [[CMP_I:%.*]] = fcmp olt <2 x double> %v1, %v2
-// CONSTRAINED: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> %v1, <2 x double> %v2, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// COMMONIR: ret <2 x i64> [[SEXT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcltq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x double> [[V1]], [[V2]]
+// UNCONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// UNCONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x i64> @test_vcltq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[CMP_I:%.*]] = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> [[V1]], <2 x double> [[V2]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CONSTRAINED-NEXT: ret <2 x i64> [[SEXT_I]]
+//
 uint64x2_t test_vcltq_f64(float64x2_t v1, float64x2_t v2) {
   return vcltq_f64(v1, v2);
 }
-// COMMON-LABEL: test_vpadds_f32
-// COMMONIR: [[LANE0_I:%.*]] = extractelement <2 x float> %a, i64 0
-// COMMONIR: [[LANE1_I:%.*]] = extractelement <2 x float> %a, i64 1
-// UNCONSTRAINED: [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]]
-// CONSTRAINED: [[VPADDD_I:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LANE0_I]], float [[LANE1_I]], metadata !"round.tonearest", metadata !"fpexcept.strict"
-// COMMONIR: ret float [[VPADDD_I]]
+// UNCONSTRAINED-LABEL: define dso_local float @test_vpadds_f32(
+// UNCONSTRAINED-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[LANE0_I:%.*]] = extractelement <2 x float> [[A]], i64 0
+// UNCONSTRAINED-NEXT: [[LANE1_I:%.*]] = extractelement <2 x float> [[A]], i64 1
+// UNCONSTRAINED-NEXT: [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]]
+// UNCONSTRAINED-NEXT: ret float [[VPADDD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local float @test_vpadds_f32(
+// CONSTRAINED-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[LANE0_I:%.*]] = extractelement <2 x float> [[A]], i64 0
+// CONSTRAINED-NEXT: [[LANE1_I:%.*]] = extractelement <2 x float> [[A]], i64 1
+// CONSTRAINED-NEXT: [[VPADDD_I:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LANE0_I]], float [[LANE1_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret float [[VPADDD_I]]
+//
 float32_t test_vpadds_f32(float32x2_t a) {
   return vpadds_f32(a);
 }
-// COMMON-LABEL: test_vpaddd_f64
-// COMMONIR: [[LANE0_I:%.*]] = extractelement <2 x double> %a, i64 0
-// COMMONIR: [[LANE1_I:%.*]] = extractelement <2 x double> %a, i64 1
-// UNCONSTRAINED: [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]]
-// CONSTRAINED: [[VPADDD_I:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LANE0_I]], double [[LANE1_I]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret double [[VPADDD_I]]
+// UNCONSTRAINED-LABEL: define dso_local double @test_vpaddd_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[LANE0_I:%.*]] = extractelement <2 x double> [[A]], i64 0
+// UNCONSTRAINED-NEXT: [[LANE1_I:%.*]] = extractelement <2 x double> [[A]], i64 1
+// UNCONSTRAINED-NEXT: [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]]
+// UNCONSTRAINED-NEXT: ret double [[VPADDD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local double @test_vpaddd_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[LANE0_I:%.*]] = extractelement <2 x double> [[A]], i64 0
+// CONSTRAINED-NEXT: [[LANE1_I:%.*]] = extractelement <2 x double> [[A]], i64 1
+// CONSTRAINED-NEXT: [[VPADDD_I:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LANE0_I]], double [[LANE1_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret double [[VPADDD_I]]
+//
 float64_t test_vpaddd_f64(float64x2_t a) {
   return vpaddd_f64(a);
 }
-// COMMON-LABEL: test_vcvts_f32_s32
-// UNCONSTRAINED: [[TMP0:%.*]] = sitofp i32 %a to float
-// CONSTRAINED: [[TMP0:%.*]] = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret float [[TMP0]]
+// UNCONSTRAINED-LABEL: define dso_local float @test_vcvts_f32_s32(
+// UNCONSTRAINED-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = sitofp i32 [[A]] to float
+// UNCONSTRAINED-NEXT: ret float [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local float @test_vcvts_f32_s32(
+// CONSTRAINED-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret float [[TMP0]]
+//
 float32_t test_vcvts_f32_s32(int32_t a) {
   return vcvts_f32_s32(a);
 }
-// COMMON-LABEL: test_vcvtd_f64_s64
-// UNCONSTRAINED: [[TMP0:%.*]] = sitofp i64 %a to double
-// CONSTRAINED: [[TMP0:%.*]] = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret double [[TMP0]]
+// UNCONSTRAINED-LABEL: define dso_local double @test_vcvtd_f64_s64(
+// UNCONSTRAINED-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = sitofp i64 [[A]] to double
+// UNCONSTRAINED-NEXT: ret double [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local double @test_vcvtd_f64_s64(
+// CONSTRAINED-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret double [[TMP0]]
+//
 float64_t test_vcvtd_f64_s64(int64_t a) {
   return vcvtd_f64_s64(a);
 }
-// COMMON-LABEL: test_vcvts_f32_u32
-// UNCONSTRAINED: [[TMP0:%.*]] = uitofp i32 %a to float
-// CONSTRAINED: [[TMP0:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret float [[TMP0]]
+// UNCONSTRAINED-LABEL: define dso_local float @test_vcvts_f32_u32(
+// UNCONSTRAINED-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = uitofp i32 [[A]] to float
+// UNCONSTRAINED-NEXT: ret float [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local float @test_vcvts_f32_u32(
+// CONSTRAINED-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret float [[TMP0]]
+//
 float32_t test_vcvts_f32_u32(uint32_t a) {
   return vcvts_f32_u32(a);
 }
 // XXX should verify the type of registers
-// COMMON-LABEL: test_vcvtd_f64_u64
-// UNCONSTRAINED: [[TMP0:%.*]] = uitofp i64 %a to double
-// CONSTRAINED: [[TMP0:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret double [[TMP0]]
+// UNCONSTRAINED-LABEL: define dso_local double @test_vcvtd_f64_u64(
+// UNCONSTRAINED-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = uitofp i64 [[A]] to double
+// UNCONSTRAINED-NEXT: ret double [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local double @test_vcvtd_f64_u64(
+// CONSTRAINED-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret double [[TMP0]]
+//
 float64_t test_vcvtd_f64_u64(uint64_t a) {
   return vcvtd_f64_u64(a);
 }
-// COMMON-LABEL: test_vceqs_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oeq float %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %b, metadata !"oeq", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vceqs_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oeq float [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vceqs_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f32(float [[A]], float [[B]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
 uint32_t test_vceqs_f32(float32_t a, float32_t b) {
   return (uint32_t)vceqs_f32(a, b);
} -// COMMON-LABEL: test_vceqd_f64 -// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oeq double %a, %b -// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double %b, metadata !"oeq", metadata !"fpexcept.strict") -// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 -// COMMONIR: ret i64 [[VCMPD_I]] +// UNCONSTRAINED-LABEL: define dso_local i64 @test_vceqd_f64( +// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oeq double [[A]], [[B]] +// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// UNCONSTRAINED-NEXT: ret i64 [[VCMPD_I]] +// +// CONSTRAINED-LABEL: define dso_local i64 @test_vceqd_f64( +// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[A]], double [[B]], metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CONSTRAINED-NEXT: ret i64 [[VCMPD_I]] +// uint64_t test_vceqd_f64(float64_t a, float64_t b) { return (uint64_t)vceqd_f64(a, b); } -// COMMON-LABEL: test_vceqzs_f32 -// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oeq float %a, 0.000000e+00 -// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float 0.000000e+00, metadata !"oeq", metadata !"fpexcept.strict") -// COMMONIR: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32 -// COMMONIR: ret i32 [[VCEQZ_I]] +// UNCONSTRAINED-LABEL: define dso_local i32 @test_vceqzs_f32( +// UNCONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oeq float [[A]], 0.000000e+00 +// UNCONSTRAINED-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// UNCONSTRAINED-NEXT: ret i32 [[VCEQZ_I]] +// +// CONSTRAINED-LABEL: define dso_local i32 @test_vceqzs_f32( +// CONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f32(float [[A]], float 0.000000e+00, metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CONSTRAINED-NEXT: ret i32 [[VCEQZ_I]] +// uint32_t test_vceqzs_f32(float32_t a) { return (uint32_t)vceqzs_f32(a); } -// COMMON-LABEL: test_vceqzd_f64 -// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oeq double %a, 0.000000e+00 -// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double %a, double 0.000000e+00, metadata !"oeq", metadata !"fpexcept.strict") -// COMMONIR: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// COMMONIR: ret i64 [[VCEQZ_I]] +// UNCONSTRAINED-LABEL: define dso_local i64 @test_vceqzd_f64( +// UNCONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oeq double [[A]], 0.000000e+00 +// UNCONSTRAINED-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// UNCONSTRAINED-NEXT: ret i64 [[VCEQZ_I]] +// +// CONSTRAINED-LABEL: define dso_local i64 @test_vceqzd_f64( +// CONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmp.f64(double [[A]], double 0.000000e+00, metadata !"oeq", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[VCEQZ_I:%.*]] = sext 
i1 [[TMP0]] to i64 +// CONSTRAINED-NEXT: ret i64 [[VCEQZ_I]] +// uint64_t test_vceqzd_f64(float64_t a) { return (uint64_t)vceqzd_f64(a); } -// COMMON-LABEL: test_vcges_f32 -// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oge float %a, %b -// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"oge", metadata !"fpexcept.strict") -// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 -// COMMONIR: ret i32 [[VCMPD_I]] +// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcges_f32( +// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oge float [[A]], [[B]] +// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// UNCONSTRAINED-NEXT: ret i32 [[VCMPD_I]] +// +// CONSTRAINED-LABEL: define dso_local i32 @test_vcges_f32( +// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float [[B]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CONSTRAINED-NEXT: ret i32 [[VCMPD_I]] +// uint32_t test_vcges_f32(float32_t a, float32_t b) { return (uint32_t)vcges_f32(a, b); } -// COMMON-LABEL: test_vcged_f64 -// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oge double %a, %b -// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"oge", metadata !"fpexcept.strict") -// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 -// COMMONIR: ret i64 [[VCMPD_I]] +// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcged_f64( +// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oge double [[A]], [[B]] +// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// UNCONSTRAINED-NEXT: ret i64 [[VCMPD_I]] +// +// CONSTRAINED-LABEL: define dso_local i64 @test_vcged_f64( +// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double [[B]], metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CONSTRAINED-NEXT: ret i64 [[VCMPD_I]] +// uint64_t test_vcged_f64(float64_t a, float64_t b) { return (uint64_t)vcged_f64(a, b); } -// COMMON-LABEL: test_vcgezs_f32 -// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oge float %a, 0.000000e+00 -// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float 0.000000e+00, metadata !"oge", metadata !"fpexcept.strict") -// COMMONIR: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32 -// COMMONIR: ret i32 [[VCGEZ_I]] +// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcgezs_f32( +// UNCONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oge float [[A]], 0.000000e+00 +// UNCONSTRAINED-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// UNCONSTRAINED-NEXT: ret i32 [[VCGEZ_I]] +// +// CONSTRAINED-LABEL: define dso_local i32 @test_vcgezs_f32( +// CONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 
@llvm.experimental.constrained.fcmps.f32(float [[A]], float 0.000000e+00, metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CONSTRAINED-NEXT: ret i32 [[VCGEZ_I]] +// uint32_t test_vcgezs_f32(float32_t a) { return (uint32_t)vcgezs_f32(a); } -// COMMON-LABEL: test_vcgezd_f64 -// UNCONSTRAINED: [[TMP0:%.*]] = fcmp oge double %a, 0.000000e+00 -// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double 0.000000e+00, metadata !"oge", metadata !"fpexcept.strict") -// COMMONIR: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// COMMONIR: ret i64 [[VCGEZ_I]] +// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcgezd_f64( +// UNCONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp oge double [[A]], 0.000000e+00 +// UNCONSTRAINED-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// UNCONSTRAINED-NEXT: ret i64 [[VCGEZ_I]] +// +// CONSTRAINED-LABEL: define dso_local i64 @test_vcgezd_f64( +// CONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double 0.000000e+00, metadata !"oge", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CONSTRAINED-NEXT: ret i64 [[VCGEZ_I]] +// uint64_t test_vcgezd_f64(float64_t a) { return (uint64_t)vcgezd_f64(a); } -// COMMON-LABEL: test_vcgts_f32 -// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ogt float %a, %b -// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"ogt", metadata !"fpexcept.strict") -// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 -// COMMONIR: ret i32 [[VCMPD_I]] +// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcgts_f32( +// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ogt float [[A]], [[B]] +// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// UNCONSTRAINED-NEXT: ret i32 [[VCMPD_I]] +// +// CONSTRAINED-LABEL: define dso_local i32 @test_vcgts_f32( +// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float [[B]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CONSTRAINED-NEXT: ret i32 [[VCMPD_I]] +// uint32_t test_vcgts_f32(float32_t a, float32_t b) { return (uint32_t)vcgts_f32(a, b); } -// COMMON-LABEL: test_vcgtd_f64 -// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ogt double %a, %b -// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"ogt", metadata !"fpexcept.strict") -// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 -// COMMONIR: ret i64 [[VCMPD_I]] +// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcgtd_f64( +// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ogt double [[A]], [[B]] +// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// UNCONSTRAINED-NEXT: ret i64 [[VCMPD_I]] +// +// CONSTRAINED-LABEL: define dso_local i64 @test_vcgtd_f64( +// 
CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double [[B]], metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CONSTRAINED-NEXT: ret i64 [[VCMPD_I]] +// uint64_t test_vcgtd_f64(float64_t a, float64_t b) { return (uint64_t)vcgtd_f64(a, b); } -// COMMON-LABEL: test_vcgtzs_f32 -// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ogt float %a, 0.000000e+00 -// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float 0.000000e+00, metadata !"ogt", metadata !"fpexcept.strict") -// COMMONIR: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32 -// COMMONIR: ret i32 [[VCGTZ_I]] +// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcgtzs_f32( +// UNCONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ogt float [[A]], 0.000000e+00 +// UNCONSTRAINED-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// UNCONSTRAINED-NEXT: ret i32 [[VCGTZ_I]] +// +// CONSTRAINED-LABEL: define dso_local i32 @test_vcgtzs_f32( +// CONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float 0.000000e+00, metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CONSTRAINED-NEXT: ret i32 [[VCGTZ_I]] +// uint32_t test_vcgtzs_f32(float32_t a) { return (uint32_t)vcgtzs_f32(a); } -// COMMON-LABEL: test_vcgtzd_f64 -// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ogt double %a, 0.000000e+00 -// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double 0.000000e+00, metadata !"ogt", metadata !"fpexcept.strict") -// COMMONIR: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// COMMONIR: ret i64 [[VCGTZ_I]] +// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcgtzd_f64( +// UNCONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ogt double [[A]], 0.000000e+00 +// UNCONSTRAINED-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// UNCONSTRAINED-NEXT: ret i64 [[VCGTZ_I]] +// +// CONSTRAINED-LABEL: define dso_local i64 @test_vcgtzd_f64( +// CONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double 0.000000e+00, metadata !"ogt", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CONSTRAINED-NEXT: ret i64 [[VCGTZ_I]] +// uint64_t test_vcgtzd_f64(float64_t a) { return (uint64_t)vcgtzd_f64(a); } -// COMMON-LABEL: test_vcles_f32 -// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ole float %a, %b -// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"ole", metadata !"fpexcept.strict") -// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 -// COMMONIR: ret i32 [[VCMPD_I]] +// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcles_f32( +// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ole float [[A]], [[B]] +// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = 
sext i1 [[TMP0]] to i32 +// UNCONSTRAINED-NEXT: ret i32 [[VCMPD_I]] +// +// CONSTRAINED-LABEL: define dso_local i32 @test_vcles_f32( +// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float [[B]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CONSTRAINED-NEXT: ret i32 [[VCMPD_I]] +// uint32_t test_vcles_f32(float32_t a, float32_t b) { return (uint32_t)vcles_f32(a, b); } -// COMMON-LABEL: test_vcled_f64 -// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ole double %a, %b -// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"ole", metadata !"fpexcept.strict") -// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 -// COMMONIR: ret i64 [[VCMPD_I]] +// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcled_f64( +// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ole double [[A]], [[B]] +// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// UNCONSTRAINED-NEXT: ret i64 [[VCMPD_I]] +// +// CONSTRAINED-LABEL: define dso_local i64 @test_vcled_f64( +// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double [[B]], metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CONSTRAINED-NEXT: ret i64 [[VCMPD_I]] +// uint64_t test_vcled_f64(float64_t a, float64_t b) { return (uint64_t)vcled_f64(a, b); } -// COMMON-LABEL: test_vclezs_f32 -// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ole float %a, 0.000000e+00 -// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float 0.000000e+00, metadata !"ole", metadata !"fpexcept.strict") -// COMMONIR: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32 -// COMMONIR: ret i32 [[VCLEZ_I]] +// UNCONSTRAINED-LABEL: define dso_local i32 @test_vclezs_f32( +// UNCONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ole float [[A]], 0.000000e+00 +// UNCONSTRAINED-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// UNCONSTRAINED-NEXT: ret i32 [[VCLEZ_I]] +// +// CONSTRAINED-LABEL: define dso_local i32 @test_vclezs_f32( +// CONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float 0.000000e+00, metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]] +// CONSTRAINED-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CONSTRAINED-NEXT: ret i32 [[VCLEZ_I]] +// uint32_t test_vclezs_f32(float32_t a) { return (uint32_t)vclezs_f32(a); } -// COMMON-LABEL: test_vclezd_f64 -// UNCONSTRAINED: [[TMP0:%.*]] = fcmp ole double %a, 0.000000e+00 -// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double 0.000000e+00, metadata !"ole", metadata !"fpexcept.strict") -// COMMONIR: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// COMMONIR: ret i64 [[VCLEZ_I]] +// UNCONSTRAINED-LABEL: define dso_local i64 @test_vclezd_f64( +// UNCONSTRAINED-SAME: double noundef 
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp ole double [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCLEZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vclezd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double 0.000000e+00, metadata !"ole", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCLEZ_I]]
+//
uint64_t test_vclezd_f64(float64_t a) { return (uint64_t)vclezd_f64(a); }
-// COMMON-LABEL: test_vclts_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp olt float %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %b, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vclts_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp olt float [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vclts_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float [[B]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCMPD_I]]
+//
uint32_t test_vclts_f32(float32_t a, float32_t b) { return (uint32_t)vclts_f32(a, b); }
-// COMMON-LABEL: test_vcltd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp olt double %a, %b
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double %b, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCMPD_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcltd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp olt double [[A]], [[B]]
+// UNCONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vcltd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double [[B]], metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCMPD_I]]
+//
uint64_t test_vcltd_f64(float64_t a, float64_t b) { return (uint64_t)vcltd_f64(a, b); }
-// COMMON-LABEL: test_vcltzs_f32
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp olt float %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float 0.000000e+00, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// COMMONIR: ret i32 [[VCLTZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i32 @test_vcltzs_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp olt float [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// UNCONSTRAINED-NEXT: ret i32 [[VCLTZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i32 @test_vcltzs_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[A]], float 0.000000e+00, metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CONSTRAINED-NEXT: ret i32 [[VCLTZ_I]]
+//
uint32_t test_vcltzs_f32(float32_t a) { return (uint32_t)vcltzs_f32(a); }
-// COMMON-LABEL: test_vcltzd_f64
-// UNCONSTRAINED: [[TMP0:%.*]] = fcmp olt double %a, 0.000000e+00
-// CONSTRAINED: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double %a, double 0.000000e+00, metadata !"olt", metadata !"fpexcept.strict")
-// COMMONIR: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// COMMONIR: ret i64 [[VCLTZ_I]]
+// UNCONSTRAINED-LABEL: define dso_local i64 @test_vcltzd_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fcmp olt double [[A]], 0.000000e+00
+// UNCONSTRAINED-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// UNCONSTRAINED-NEXT: ret i64 [[VCLTZ_I]]
+//
+// CONSTRAINED-LABEL: define dso_local i64 @test_vcltzd_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f64(double [[A]], double 0.000000e+00, metadata !"olt", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CONSTRAINED-NEXT: ret i64 [[VCLTZ_I]]
+//
uint64_t test_vcltzd_f64(float64_t a) { return (uint64_t)vcltzd_f64(a); }
-// COMMON-LABEL: test_vadd_f64
-// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <1 x double> %a, %b
-// CONSTRAINED: [[ADD_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fadd.v1f64(<1 x double> %a, <1 x double> %b, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[ADD_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vadd_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[ADD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vadd_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fadd.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[ADD_I]]
+//
float64x1_t test_vadd_f64(float64x1_t a, float64x1_t b) { return vadd_f64(a, b); }
-// COMMON-LABEL: test_vmul_f64
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <1 x double> %a, %b
-// CONSTRAINED: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> %a, <1 x double> %b, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[MUL_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vmul_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[MUL_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vmul_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[MUL_I]]
+//
float64x1_t test_vmul_f64(float64x1_t a, float64x1_t b) { return vmul_f64(a, b); }
-// COMMON-LABEL: test_vdiv_f64
-// UNCONSTRAINED: [[DIV_I:%.*]] = fdiv <1 x double> %a, %b
-// CONSTRAINED: [[DIV_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fdiv.v1f64(<1 x double> %a, <1 x double> %b, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[DIV_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vdiv_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[DIV_I:%.*]] = fdiv <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[DIV_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vdiv_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[DIV_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fdiv.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[DIV_I]]
+//
float64x1_t test_vdiv_f64(float64x1_t a, float64x1_t b) { return vdiv_f64(a, b); }
-// COMMON-LABEL: test_vmla_f64
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <1 x double> %b, %c
-// CONSTRAINED: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> %b, <1 x double> %c, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// UNCONSTRAINED: [[ADD_I:%.*]] = fadd <1 x double> %a, [[MUL_I]]
-// CONSTRAINED: [[ADD_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fadd.v1f64(<1 x double> %a, <1 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[ADD_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vmla_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[B]], [[C]]
+// UNCONSTRAINED-NEXT: [[ADD_I:%.*]] = fadd <1 x double> [[A]], [[MUL_I]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[ADD_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vmla_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> [[B]], <1 x double> [[C]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[ADD_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fadd.v1f64(<1 x double> [[A]], <1 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[ADD_I]]
+//
float64x1_t test_vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) { return vmla_f64(a, b, c); }
-// COMMON-LABEL: test_vmls_f64
-// UNCONSTRAINED: [[MUL_I:%.*]] = fmul <1 x double> %b, %c
-// CONSTRAINED: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> %b, <1 x double> %c, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <1 x double> %a, [[MUL_I]]
-// CONSTRAINED: [[SUB_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fsub.v1f64(<1 x double> %a, <1 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[SUB_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vmls_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[B]], [[C]]
+// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <1 x double> [[A]], [[MUL_I]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[SUB_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vmls_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[MUL_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fmul.v1f64(<1 x double> [[B]], <1 x double> [[C]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fsub.v1f64(<1 x double> [[A]], <1 x double> [[MUL_I]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[SUB_I]]
+//
float64x1_t test_vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) { return vmls_f64(a, b, c); }
-// COMMON-LABEL: test_vfma_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> %c, <1 x double> %a)
-// CONSTRAINED: [[TMP3:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> %b, <1 x double> %c, <1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[C]], <1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[B]], <1 x double> [[C]], <1 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) { return vfma_f64(a, b, c); }
-// COMMON-LABEL: test_vfms_f64
-// COMMONIR: [[SUB_I:%.*]] = fneg <1 x double> %b
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> [[SUB_I]] to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
-// UNCONSTRAINED: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[SUB_I]], <1 x double> %c, <1 x double> %a)
-// CONSTRAINED: [[TMP3:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[SUB_I]], <1 x double> %c, <1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[TMP3]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[B]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[C]], <1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[B]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[C]], <1 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vfms_f64(float64x1_t a, float64x1_t b, float64x1_t c) { return vfms_f64(a, b, c); }
-// COMMON-LABEL: test_vsub_f64
-// UNCONSTRAINED: [[SUB_I:%.*]] = fsub <1 x double> %a, %b
-// CONSTRAINED: [[SUB_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fsub.v1f64(<1 x double> %a, <1 x double> %b, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[SUB_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vsub_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[SUB_I:%.*]] = fsub <1 x double> [[A]], [[B]]
+// UNCONSTRAINED-NEXT: ret <1 x double> [[SUB_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vsub_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[SUB_I:%.*]] = call <1 x double> @llvm.experimental.constrained.fsub.v1f64(<1 x double> [[A]], <1 x double> [[B]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[SUB_I]]
+//
float64x1_t test_vsub_f64(float64x1_t a, float64x1_t b) { return vsub_f64(a, b); }
-// COMMON-LABEL: test_vcvt_s64_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %a)
-// COMMONIR: ret <1 x i64> [[TMP1]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcvt_s64_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcvt_s64_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> [[A]]) #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ1_I]]
+//
int64x1_t test_vcvt_s64_f64(float64x1_t a) { return vcvt_s64_f64(a); }
-// COMMON-LABEL: test_vcvt_u64_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %a)
-// COMMONIR: ret <1 x i64> [[TMP1]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcvt_u64_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x i64> @test_vcvt_u64_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> [[A]]) #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x i64> [[VCVTZ1_I]]
+//
uint64x1_t test_vcvt_u64_f64(float64x1_t a) { return vcvt_u64_f64(a); }
-// COMMON-LABEL: test_vcvt_f64_s64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// UNCONSTRAINED: [[VCVT_I:%.*]] = sitofp <1 x i64> %a to <1 x double>
-// CONSTRAINED: [[VCVT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VCVT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vcvt_f64_s64(
+// UNCONSTRAINED-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VCVT_I:%.*]] = sitofp <1 x i64> [[A]] to <1 x double>
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VCVT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vcvt_f64_s64(
+// CONSTRAINED-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VCVT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.sitofp.v1f64.v1i64(<1 x i64> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VCVT_I]]
+//
float64x1_t test_vcvt_f64_s64(int64x1_t a) { return vcvt_f64_s64(a); }
-// COMMON-LABEL: test_vcvt_f64_u64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// UNCONSTRAINED: [[VCVT_I:%.*]] = uitofp <1 x i64> %a to <1 x double>
-// CONSTRAINED: [[VCVT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VCVT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vcvt_f64_u64(
+// UNCONSTRAINED-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VCVT_I:%.*]] = uitofp <1 x i64> [[A]] to <1 x double>
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VCVT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vcvt_f64_u64(
+// CONSTRAINED-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VCVT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.uitofp.v1f64.v1i64(<1 x i64> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VCVT_I]]
+//
float64x1_t test_vcvt_f64_u64(uint64x1_t a) { return vcvt_f64_u64(a); }
-// COMMON-LABEL: test_vrnda_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// UNCONSTRAINED: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> %a)
-// CONSTRAINED: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.round.v1f64(<1 x double> %a, metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VRNDA1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrnda_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDA1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrnda_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.round.v1f64(<1 x double> [[A]], metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VRNDA1_I]]
+//
float64x1_t test_vrnda_f64(float64x1_t a) { return vrnda_f64(a); }
-// COMMON-LABEL: test_vrndp_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// UNCONSTRAINED: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> %a)
-// CONSTRAINED: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.ceil.v1f64(<1 x double> %a, metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VRNDP1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndp_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDP1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndp_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.ceil.v1f64(<1 x double> [[A]], metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VRNDP1_I]]
+//
float64x1_t test_vrndp_f64(float64x1_t a) { return vrndp_f64(a); }
-// COMMON-LABEL: test_vrndm_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// UNCONSTRAINED: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> %a)
-// CONSTRAINED: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.floor.v1f64(<1 x double> %a, metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VRNDM1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndm_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDM1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndm_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.floor.v1f64(<1 x double> [[A]], metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VRNDM1_I]]
+//
float64x1_t test_vrndm_f64(float64x1_t a) { return vrndm_f64(a); }
-// COMMON-LABEL: test_vrndx_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// UNCONSTRAINED: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> %a)
-// CONSTRAINED: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.rint.v1f64(<1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VRNDX1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndx_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDX1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndx_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.rint.v1f64(<1 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VRNDX1_I]]
+//
float64x1_t test_vrndx_f64(float64x1_t a) { return vrndx_f64(a); }
-// COMMON-LABEL: test_vrnd_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// UNCONSTRAINED: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> %a)
-// CONSTRAINED: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.trunc.v1f64(<1 x double> %a, metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VRNDZ1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrnd_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDZ1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrnd_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.trunc.v1f64(<1 x double> [[A]], metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VRNDZ1_I]]
+//
float64x1_t test_vrnd_f64(float64x1_t a) { return vrnd_f64(a); }
-// COMMON-LABEL: test_vrndi_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// UNCONSTRAINED: [[VRNDI1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %a)
-// CONSTRAINED: [[VRNDI1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.nearbyint.v1f64(<1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VRNDI1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndi_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VRNDI_V1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VRNDI_V1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vrndi_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VRNDI_V1_I:%.*]] = call <1 x double> @llvm.experimental.constrained.nearbyint.v1f64(<1 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VRNDI_V1_I]]
+//
float64x1_t test_vrndi_f64(float64x1_t a) { return vrndi_f64(a); }
-// COMMON-LABEL: test_vsqrt_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// UNCONSTRAINED: [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> %a)
-// CONSTRAINED: [[VSQRT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.sqrt.v1f64(<1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// COMMONIR: ret <1 x double> [[VSQRT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vsqrt_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[VSQRT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vsqrt_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <1 x double> @llvm.experimental.constrained.sqrt.v1f64(<1 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR3]]
+// CONSTRAINED-NEXT: ret <1 x double> [[VSQRT_I]]
+//
float64x1_t test_vsqrt_f64(float64x1_t a) { return vsqrt_f64(a); }
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index 271ae056308d2..3cfa0ac3aa6b1 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -1,17418 +1,22471 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -disable-O0-optnone \
// RUN: -flax-vector-conversions=none -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
#include <arm_neon.h>
-// CHECK-LABEL: @test_vadd_s8(
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, %v2
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
int8x8_t test_vadd_s8(int8x8_t v1, int8x8_t v2) { return vadd_s8(v1, v2); }
-// CHECK-LABEL: @test_vadd_s16(
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, %v2
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
int16x4_t test_vadd_s16(int16x4_t v1, int16x4_t v2) { return vadd_s16(v1, v2); }
-// CHECK-LABEL: @test_vadd_s32(
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, %v2
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
int32x2_t test_vadd_s32(int32x2_t v1, int32x2_t v2) { return vadd_s32(v1, v2); }
-// CHECK-LABEL: @test_vadd_s64(
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> %v1, %v2
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vadd_s64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
int64x1_t test_vadd_s64(int64x1_t v1, int64x1_t v2) { return vadd_s64(v1, v2); }
-// CHECK-LABEL: @test_vadd_f32(
-// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %v1, %v2
-// CHECK: ret <2 x float> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vadd_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x float> [[ADD_I]]
+//
float32x2_t test_vadd_f32(float32x2_t v1, float32x2_t v2) { return vadd_f32(v1, v2); }
-// CHECK-LABEL: @test_vadd_u8(
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, %v2
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
uint8x8_t test_vadd_u8(uint8x8_t v1, uint8x8_t v2) { return vadd_u8(v1, v2); }
-// CHECK-LABEL: @test_vadd_u16(
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, %v2
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
uint16x4_t test_vadd_u16(uint16x4_t v1, uint16x4_t v2) { return vadd_u16(v1, v2); }
-// CHECK-LABEL: @test_vadd_u32(
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, %v2
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
uint32x2_t test_vadd_u32(uint32x2_t v1, uint32x2_t v2) { return vadd_u32(v1, v2); }
-// CHECK-LABEL: @test_vadd_u64(
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> %v1, %v2
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vadd_u64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
uint64x1_t test_vadd_u64(uint64x1_t v1, uint64x1_t v2) { return vadd_u64(v1, v2); }
-// CHECK-LABEL: @test_vaddq_s8(
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, %v2
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
int8x16_t test_vaddq_s8(int8x16_t v1, int8x16_t v2) { return vaddq_s8(v1, v2); }
-// CHECK-LABEL: @test_vaddq_s16(
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, %v2
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vaddq_s16(int16x8_t v1, int16x8_t v2) { return vaddq_s16(v1, v2); }
-// CHECK-LABEL: @test_vaddq_s32(
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, %v2
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vaddq_s32(int32x4_t v1, int32x4_t v2) { return vaddq_s32(v1, v2); }
-// CHECK-LABEL: @test_vaddq_s64(
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %v1, %v2
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddq_s64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vaddq_s64(int64x2_t v1, int64x2_t v2) { return vaddq_s64(v1, v2); }
-// CHECK-LABEL: @test_vaddq_f32(
-// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %v1, %v2
-// CHECK: ret <4 x float> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vaddq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x float> [[ADD_I]]
+//
float32x4_t test_vaddq_f32(float32x4_t v1, float32x4_t v2) { return vaddq_f32(v1, v2); }
-// CHECK-LABEL: @test_vaddq_f64(
-// CHECK: [[ADD_I:%.*]] = fadd <2 x double> %v1, %v2
-// CHECK: ret <2 x double> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vaddq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x double> [[ADD_I]]
+//
float64x2_t test_vaddq_f64(float64x2_t v1, float64x2_t v2) { return vaddq_f64(v1, v2); }
-// CHECK-LABEL: @test_vaddq_u8(
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, %v2
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
uint8x16_t test_vaddq_u8(uint8x16_t v1, uint8x16_t v2) { return vaddq_u8(v1, v2); }
-// CHECK-LABEL: @test_vaddq_u16(
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, %v2
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vaddq_u16(uint16x8_t v1, uint16x8_t v2) { return vaddq_u16(v1, v2); }
-// CHECK-LABEL: @test_vaddq_u32(
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, %v2
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vaddq_u32(uint32x4_t v1, uint32x4_t v2) { return vaddq_u32(v1, v2); }
-// CHECK-LABEL: @test_vaddq_u64(
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %v1, %v2
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddq_u64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vaddq_u64(uint64x2_t v1, uint64x2_t v2) { return vaddq_u64(v1, v2); }
-// CHECK-LABEL: @test_vsub_s8(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsub_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
int8x8_t test_vsub_s8(int8x8_t v1, int8x8_t v2) { return vsub_s8(v1, v2); }
-// CHECK-LABEL: @test_vsub_s16(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsub_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
int16x4_t test_vsub_s16(int16x4_t v1, int16x4_t v2) { return vsub_s16(v1, v2); }
-// CHECK-LABEL: @test_vsub_s32(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsub_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
int32x2_t test_vsub_s32(int32x2_t v1, int32x2_t v2) { return vsub_s32(v1, v2); }
-// CHECK-LABEL: @test_vsub_s64(
-// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2
-// CHECK: ret <1 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsub_s64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <1 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: ret <1 x i64> [[SUB_I]]
+//
int64x1_t test_vsub_s64(int64x1_t v1, int64x1_t v2) { return vsub_s64(v1, v2); }
-// CHECK-LABEL: @test_vsub_f32(
-// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %v1, %v2
-// CHECK: ret <2 x float> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vsub_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x float> [[SUB_I]]
+//
float32x2_t test_vsub_f32(float32x2_t v1, float32x2_t v2) { return vsub_f32(v1, v2); }
-// CHECK-LABEL: @test_vsub_u8(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsub_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
uint8x8_t test_vsub_u8(uint8x8_t v1, uint8x8_t v2) { return vsub_u8(v1, v2); }
-// CHECK-LABEL: @test_vsub_u16(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsub_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
uint16x4_t test_vsub_u16(uint16x4_t v1, uint16x4_t v2) { return vsub_u16(v1, v2); }
-// CHECK-LABEL: @test_vsub_u32(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsub_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
uint32x2_t test_vsub_u32(uint32x2_t v1, uint32x2_t v2) { return vsub_u32(v1, v2); }
-// CHECK-LABEL: @test_vsub_u64(
-// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2
-// CHECK: ret <1 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsub_u64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <1 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: ret <1 x i64> [[SUB_I]]
+//
uint64x1_t test_vsub_u64(uint64x1_t v1, uint64x1_t v2) { return vsub_u64(v1, v2); }
-// CHECK-LABEL: @test_vsubq_s8(
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsubq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
int8x16_t test_vsubq_s8(int8x16_t v1, int8x16_t v2) { return vsubq_s8(v1, v2); }
-// CHECK-LABEL: @test_vsubq_s16(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vsubq_s16(int16x8_t v1, int16x8_t v2) { return vsubq_s16(v1, v2); }
-// CHECK-LABEL: @test_vsubq_s32(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vsubq_s32(int32x4_t v1, int32x4_t v2) { return vsubq_s32(v1, v2); }
-// CHECK-LABEL: @test_vsubq_s64(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubq_s64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
int64x2_t test_vsubq_s64(int64x2_t v1, int64x2_t v2) { return vsubq_s64(v1, v2); }
-// CHECK-LABEL: @test_vsubq_f32(
-// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %v1, %v2
-// CHECK: ret <4 x float> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vsubq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x float> [[SUB_I]]
+//
float32x4_t test_vsubq_f32(float32x4_t v1, float32x4_t v2) { return vsubq_f32(v1, v2); }
-// CHECK-LABEL: @test_vsubq_f64(
-// CHECK: [[SUB_I:%.*]] = fsub <2 x double> %v1, %v2
-// CHECK: ret <2 x double> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vsubq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x double> [[SUB_I]]
+//
float64x2_t test_vsubq_f64(float64x2_t v1, float64x2_t v2) { return vsubq_f64(v1, v2); }
-// CHECK-LABEL: @test_vsubq_u8(
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsubq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
uint8x16_t test_vsubq_u8(uint8x16_t v1, uint8x16_t v2) { return vsubq_u8(v1, v2); }
-// CHECK-LABEL: @test_vsubq_u16(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vsubq_u16(uint16x8_t v1, uint16x8_t v2) { return vsubq_u16(v1, v2); }
-// CHECK-LABEL: @test_vsubq_u32(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vsubq_u32(uint32x4_t v1, uint32x4_t v2) { return vsubq_u32(v1, v2); }
-// CHECK-LABEL: @test_vsubq_u64(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubq_u64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
uint64x2_t test_vsubq_u64(uint64x2_t v1, uint64x2_t v2) { return vsubq_u64(v1, v2); }
-// CHECK-LABEL: @test_vmul_s8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2
-// CHECK: ret <8 x i8> [[MUL_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmul_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i8> [[MUL_I]]
+//
int8x8_t test_vmul_s8(int8x8_t v1, int8x8_t v2) { return vmul_s8(v1, v2); }
-// CHECK-LABEL: @test_vmul_s16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2
-// CHECK: ret <4 x i16> [[MUL_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmul_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
+//
int16x4_t test_vmul_s16(int16x4_t v1, int16x4_t v2) { return vmul_s16(v1, v2); }
-// CHECK-LABEL: @test_vmul_s32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2
-// CHECK: ret <2 x i32> [[MUL_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmul_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
+//
int32x2_t test_vmul_s32(int32x2_t v1, int32x2_t v2) { return vmul_s32(v1, v2); }
-// CHECK-LABEL: @test_vmul_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v1, %v2
-// CHECK: ret <2 x float> [[MUL_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vmul_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x float> [[MUL_I]]
+//
float32x2_t test_vmul_f32(float32x2_t v1, float32x2_t v2) { return vmul_f32(v1, v2); }
-// CHECK-LABEL: @test_vmul_u8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2
-// CHECK: ret <8 x i8> [[MUL_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmul_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i8> [[MUL_I]]
+//
uint8x8_t test_vmul_u8(uint8x8_t v1, uint8x8_t v2) { return vmul_u8(v1, v2); }
-// CHECK-LABEL: @test_vmul_u16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2
-// CHECK: ret <4 x i16> [[MUL_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmul_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
+//
uint16x4_t test_vmul_u16(uint16x4_t v1, uint16x4_t v2) { return vmul_u16(v1, v2); }
-// CHECK-LABEL: @test_vmul_u32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2
-// CHECK: ret <2 x i32> [[MUL_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmul_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
+//
uint32x2_t test_vmul_u32(uint32x2_t v1, uint32x2_t v2) { return vmul_u32(v1, v2); }
-// CHECK-LABEL: @test_vmulq_s8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2
-// CHECK: ret <16 x i8> [[MUL_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmulq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <16 x i8> [[MUL_I]]
+//
int8x16_t test_vmulq_s8(int8x16_t v1, int8x16_t v2) { return vmulq_s8(v1, v2); }
-// CHECK-LABEL: @test_vmulq_s16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2
-// CHECK: ret <8 x i16> [[MUL_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmulq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
+//
int16x8_t test_vmulq_s16(int16x8_t v1, int16x8_t v2) { return vmulq_s16(v1, v2); }
-// CHECK-LABEL: @test_vmulq_s32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2
-// CHECK: ret <4 x i32> [[MUL_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmulq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
+//
int32x4_t test_vmulq_s32(int32x4_t v1, int32x4_t v2) { return vmulq_s32(v1, v2); }
-// CHECK-LABEL: @test_vmulq_u8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2
-// CHECK: ret <16 x i8> [[MUL_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmulq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: ret <16 x i8> [[MUL_I]]
+//
uint8x16_t test_vmulq_u8(uint8x16_t v1, uint8x16_t v2) { return vmulq_u8(v1, v2); }
-// CHECK-LABEL: @test_vmulq_u16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2
-// CHECK: ret <8 x i16> [[MUL_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmulq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
+//
uint16x8_t test_vmulq_u16(uint16x8_t v1, uint16x8_t v2) { return vmulq_u16(v1, v2); }
-// CHECK-LABEL: @test_vmulq_u32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2
-// CHECK: ret <4 x i32> [[MUL_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmulq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
+//
uint32x4_t test_vmulq_u32(uint32x4_t v1, uint32x4_t v2) { return vmulq_u32(v1, v2); }
-// CHECK-LABEL: @test_vmulq_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v1, %v2
-// CHECK: ret <4 x float> [[MUL_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vmulq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x float> [[MUL_I]]
+//
float32x4_t test_vmulq_f32(float32x4_t v1, float32x4_t v2) { return vmulq_f32(v1, v2); }
-// CHECK-LABEL: @test_vmulq_f64(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v1, %v2
-// CHECK: ret <2 x double> [[MUL_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vmulq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x double> [[MUL_I]]
+//
float64x2_t test_vmulq_f64(float64x2_t v1, float64x2_t v2) { return vmulq_f64(v1, v2); }
-// CHECK-LABEL: @test_vmul_p8(
-// CHECK: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VMUL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmul_p8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VMUL_V_I]]
+//
poly8x8_t test_vmul_p8(poly8x8_t v1, poly8x8_t v2) { return vmul_p8(v1, v2); }
-// CHECK-LABEL: @test_vmulq_p8(
-// CHECK: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VMULQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmulq_p8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VMULQ_V_I]]
+//
poly8x16_t test_vmulq_p8(poly8x16_t v1, poly8x16_t v2) { return vmulq_p8(v1, v2); }
-// CHECK-LABEL: @test_vmla_s8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]]
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmla_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
int8x8_t test_vmla_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) { return vmla_s8(v1, v2, v3); }
-// CHECK-LABEL: @test_vmla_s16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]]
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[ADD_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmla_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[ADD_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vmla_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) { return (int8x8_t)vmla_s16(v1, v2, v3); }
-// CHECK-LABEL: @test_vmla_s32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmla_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
int32x2_t test_vmla_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) { return vmla_s32(v1, v2, v3); }
-// CHECK-LABEL: @test_vmla_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %v1, [[MUL_I]]
-// CHECK: ret <2 x float> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vmla_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x float> [[ADD_I]]
+//
float32x2_t test_vmla_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
  return vmla_f32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmla_u8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]]
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmla_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
uint8x8_t test_vmla_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
  return vmla_u8(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmla_u16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]]
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmla_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
uint16x4_t test_vmla_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
  return vmla_u16(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmla_u32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmla_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
uint32x2_t test_vmla_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
  return vmla_u32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmlaq_s8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]]
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmlaq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
int8x16_t test_vmlaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
  return vmlaq_s8(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmlaq_s16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlaq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vmlaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
  return vmlaq_s16(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmlaq_s32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlaq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vmlaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
  return vmlaq_s32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmlaq_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %v1, [[MUL_I]]
-// CHECK: ret <4 x float> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlaq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x float> [[ADD_I]]
+//
float32x4_t test_vmlaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
  return vmlaq_f32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmlaq_u8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]]
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmlaq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
uint8x16_t test_vmlaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
  return vmlaq_u8(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmlaq_u16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlaq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vmlaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
  return vmlaq_u16(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmlaq_u32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlaq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vmlaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
  return vmlaq_u32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmlaq_f64(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3
-// CHECK: [[ADD_I:%.*]] = fadd <2 x double> %v1, [[MUL_I]]
-// CHECK: ret <2 x double> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vmlaq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V2]], [[V3]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x double> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x double> [[ADD_I]]
+//
float64x2_t test_vmlaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
  return vmlaq_f64(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmls_s8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]]
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmls_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
int8x8_t test_vmls_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
  return vmls_s8(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmls_s16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]]
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SUB_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmls_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SUB_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vmls_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
  return (int8x8_t)vmls_s16(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmls_s32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]]
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmls_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
int32x2_t test_vmls_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
  return vmls_s32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmls_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %v1, [[MUL_I]]
-// CHECK: ret <2 x float> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vmls_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x float> [[SUB_I]]
+//
float32x2_t test_vmls_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
  return vmls_f32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmls_u8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]]
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmls_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
uint8x8_t test_vmls_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
  return vmls_u8(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmls_u16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]]
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmls_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
uint16x4_t test_vmls_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
  return vmls_u16(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmls_u32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]]
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmls_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
uint32x2_t test_vmls_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
  return vmls_u32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmlsq_s8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]]
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmlsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
int8x16_t test_vmlsq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
  return vmlsq_s8(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmlsq_s16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vmlsq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
  return vmlsq_s16(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmlsq_s32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vmlsq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
  return vmlsq_s32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmlsq_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %v1, [[MUL_I]]
-// CHECK: ret <4 x float> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlsq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x float> [[SUB_I]]
+//
float32x4_t test_vmlsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
  return vmlsq_f32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmlsq_u8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]]
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmlsq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
uint8x16_t test_vmlsq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
  return vmlsq_u8(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmlsq_u16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vmlsq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
  return vmlsq_u16(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmlsq_u32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vmlsq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
  return vmlsq_u32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vmlsq_f64(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3
-// CHECK: [[SUB_I:%.*]] = fsub <2 x double> %v1, [[MUL_I]]
-// CHECK: ret <2 x double> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vmlsq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x double> [[V2]], [[V3]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x double> [[V1]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x double> [[SUB_I]]
+//
float64x2_t test_vmlsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
  return vmlsq_f64(v1, v2, v3);
}

-// CHECK-LABEL: @test_vfma_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %v2, <2 x float> %v3, <2 x float> %v1)
-// CHECK: ret <2 x float> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vfma_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[V2]], <2 x float> [[V3]], <2 x float> [[V1]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
  return vfma_f32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vfmaq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %v2, <4 x float> %v3, <4 x float> %v1)
-// CHECK: ret <4 x float> [[TMP3]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vfmaq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[V2]], <4 x float> [[V3]], <4 x float> [[V1]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
  return vfmaq_f32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vfmaq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %v2, <2 x double> %v3, <2 x double> %v1)
-// CHECK: ret <2 x double> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vfmaq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[V2]], <2 x double> [[V3]], <2 x double> [[V1]])
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
  return vfmaq_f64(v1, v2, v3);
}

-// CHECK-LABEL: @test_vfms_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <2 x float> %v2
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %v3, <2 x float> %v1)
-// CHECK: ret <2 x float> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vfms_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[V3]], <2 x float> [[V1]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vfms_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
  return vfms_f32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vfmsq_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <4 x float> %v2
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %v3, <4 x float> %v1)
-// CHECK: ret <4 x float> [[TMP3]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vfmsq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[V3]], <4 x float> [[V1]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
float32x4_t test_vfmsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
  return vfmsq_f32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vfmsq_f64(
-// CHECK: [[SUB_I:%.*]] = fneg <2 x double> %v2
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> [[SUB_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[SUB_I]], <2 x double> %v3, <2 x double> %v1)
-// CHECK: ret <2 x double> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vfmsq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x double> [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FNEG_I]], <2 x double> [[V3]], <2 x double> [[V1]])
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
float64x2_t test_vfmsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
  return vfmsq_f64(v1, v2, v3);
}

-// CHECK-LABEL: @test_vdivq_f64(
-// CHECK: [[DIV_I:%.*]] = fdiv <2 x double> %v1, %v2
-// CHECK: ret <2 x double> [[DIV_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vdivq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DIV_I:%.*]] = fdiv <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x double> [[DIV_I]]
+//
float64x2_t test_vdivq_f64(float64x2_t v1, float64x2_t v2) {
  return vdivq_f64(v1, v2);
}

-// CHECK-LABEL: @test_vdivq_f32(
-// CHECK: [[DIV_I:%.*]] = fdiv <4 x float> %v1, %v2
-// CHECK: ret <4 x float> [[DIV_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vdivq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DIV_I:%.*]] = fdiv <4 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <4 x float> [[DIV_I]]
+//
float32x4_t test_vdivq_f32(float32x4_t v1, float32x4_t v2) {
  return vdivq_f32(v1, v2);
}

-// CHECK-LABEL: @test_vdiv_f32(
-// CHECK: [[DIV_I:%.*]] = fdiv <2 x float> %v1, %v2
-// CHECK: ret <2 x float> [[DIV_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vdiv_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DIV_I:%.*]] = fdiv <2 x float> [[V1]], [[V2]]
+// CHECK-NEXT: ret <2 x float> [[DIV_I]]
+//
float32x2_t test_vdiv_f32(float32x2_t v1, float32x2_t v2) {
  return vdiv_f32(v1, v2);
}

-// CHECK-LABEL: @test_vaba_s8(
-// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v2, <8 x i8> %v3)
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]]
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vaba_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[V2]], <8 x i8> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[VABD_I]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
int8x8_t test_vaba_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
  return vaba_s8(v1, v2, v3);
}

-// CHECK-LABEL: @test_vaba_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %v2, <4 x i16> %v3)
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]]
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vaba_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[V2]], <4 x i16> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
int16x4_t test_vaba_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
  return vaba_s16(v1, v2, v3);
}

-// CHECK-LABEL: @test_vaba_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %v2, <2 x i32> %v3)
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vaba_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[V2]], <2 x i32> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
int32x2_t test_vaba_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
  return vaba_s32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vaba_u8(
-// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v2, <8 x i8> %v3)
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]]
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vaba_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[V2]], <8 x i8> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[V1]], [[VABD_I]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
uint8x8_t test_vaba_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
  return vaba_u8(v1, v2, v3);
}

-// CHECK-LABEL: @test_vaba_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %v2, <4 x i16> %v3)
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]]
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vaba_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[V2]], <4 x i16> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
uint16x4_t test_vaba_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
  return vaba_u16(v1, v2, v3);
}

-// CHECK-LABEL: @test_vaba_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %v2, <2 x i32> %v3)
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vaba_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[V2]], <2 x i32> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
uint32x2_t test_vaba_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
  return vaba_u32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vabaq_s8(
-// CHECK: [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v2, <16 x i8> %v3)
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]]
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vabaq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> [[V2]], <16 x i8> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[VABD_I]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
int8x16_t test_vabaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
  return vabaq_s8(v1, v2, v3);
}

-// CHECK-LABEL: @test_vabaq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %v2, <8 x i16> %v3)
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabaq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[V2]], <8 x i16> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vabaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
  return vabaq_s16(v1, v2, v3);
}

-// CHECK-LABEL: @test_vabaq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %v2, <4 x i32> %v3)
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabaq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[V2]], <4 x i32> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vabaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
  return vabaq_s32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vabaq_u8(
-// CHECK: [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v2, <16 x i8> %v3)
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]]
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vabaq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> [[V2]], <16 x i8> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[V1]], [[VABD_I]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
uint8x16_t test_vabaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
  return vabaq_u8(v1, v2, v3);
}

-// CHECK-LABEL: @test_vabaq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %v2, <8 x i16> %v3)
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabaq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[V2]], <8 x i16> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vabaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
  return vabaq_u16(v1, v2, v3);
}

-// CHECK-LABEL: @test_vabaq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %v2, <4 x i32> %v3)
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabaq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[V2]], <4 x i32> [[V3]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[V1]], [[VABD2_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vabaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
  return vabaq_u32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vabd_s8(
-// CHECK: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VABD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vabd_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VABD_I]]
+//
int8x8_t test_vabd_s8(int8x8_t v1, int8x8_t v2) {
  return vabd_s8(v1, v2);
}

-// CHECK-LABEL: @test_vabd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: ret <4 x i16> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vabd_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[V1]], <4 x i16> [[V2]])
+// CHECK-NEXT: ret <4 x i16> [[VABD2_I]]
+//
int16x4_t test_vabd_s16(int16x4_t v1, int16x4_t v2) {
  return vabd_s16(v1, v2);
}

-// CHECK-LABEL: @test_vabd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: ret <2 x i32> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vabd_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[V1]], <2 x i32> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VABD2_I]]
+//
int32x2_t test_vabd_s32(int32x2_t v1, int32x2_t v2) {
  return vabd_s32(v1, v2);
}

-// CHECK-LABEL: @test_vabd_u8(
-// CHECK: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VABD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vabd_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VABD_I]]
+//
uint8x8_t test_vabd_u8(uint8x8_t v1, uint8x8_t v2) {
  return vabd_u8(v1, v2);
}

-// CHECK-LABEL: @test_vabd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: ret <4 x i16> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vabd_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[V1]], <4 x i16> [[V2]])
+// CHECK-NEXT: ret <4 x i16> [[VABD2_I]]
+//
uint16x4_t test_vabd_u16(uint16x4_t v1, uint16x4_t v2) {
  return vabd_u16(v1, v2);
}

-// CHECK-LABEL: @test_vabd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: ret <2 x i32> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vabd_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[V1]], <2 x i32> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VABD2_I]]
+//
uint32x2_t test_vabd_u32(uint32x2_t v1, uint32x2_t v2) {
  return vabd_u32(v1, v2);
}

-// CHECK-LABEL: @test_vabd_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %v1, <2 x float> %v2)
-// CHECK: ret <2 x float> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vabd_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> [[V1]], <2 x float> [[V2]])
+// CHECK-NEXT: ret <2 x float> [[VABD2_I]]
+//
float32x2_t test_vabd_f32(float32x2_t v1, float32x2_t v2) {
  return vabd_f32(v1, v2);
}

-// CHECK-LABEL: @test_vabdq_s8(
-// CHECK: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VABD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vabdq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VABD_I]]
+//
int8x16_t test_vabdq_s8(int8x16_t v1, int8x16_t v2) {
  return vabdq_s8(v1, v2);
}

-// CHECK-LABEL: @test_vabdq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: ret <8 x i16> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabdq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[V1]], <8 x i16> [[V2]])
+// CHECK-NEXT: ret <8 x i16> [[VABD2_I]]
+//
int16x8_t test_vabdq_s16(int16x8_t v1, int16x8_t v2) {
  return vabdq_s16(v1, v2);
}

-// CHECK-LABEL: @test_vabdq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: ret <4 x i32> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabdq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[V1]], <4 x i32> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VABD2_I]]
+//
int32x4_t test_vabdq_s32(int32x4_t v1, int32x4_t v2) {
  return vabdq_s32(v1, v2);
}

-// CHECK-LABEL: @test_vabdq_u8(
-// CHECK: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VABD_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vabdq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VABD_I]]
+//
uint8x16_t test_vabdq_u8(uint8x16_t v1, uint8x16_t v2) {
  return vabdq_u8(v1, v2);
}

-// CHECK-LABEL: @test_vabdq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: ret <8 x i16> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabdq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[V1]], <8 x i16> [[V2]])
+// CHECK-NEXT: ret <8 x i16> [[VABD2_I]]
+//
uint16x8_t test_vabdq_u16(uint16x8_t v1, uint16x8_t v2) {
  return vabdq_u16(v1, v2);
}

-// CHECK-LABEL: @test_vabdq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: ret <4 x i32> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabdq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[V1]], <4 x i32> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VABD2_I]]
+//
uint32x4_t test_vabdq_u32(uint32x4_t v1, uint32x4_t v2) {
  return vabdq_u32(v1, v2);
}

-// CHECK-LABEL: @test_vabdq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %v1, <4 x float> %v2)
-// CHECK: ret <4 x float> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vabdq_f32(
+// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> [[V1]], <4 x float> [[V2]])
+// CHECK-NEXT: ret <4 x float> [[VABD2_I]]
+//
float32x4_t test_vabdq_f32(float32x4_t v1, float32x4_t v2) {
  return vabdq_f32(v1, v2);
}

-// CHECK-LABEL: @test_vabdq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
-// CHECK: [[VABD2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %v1, <2 x double> %v2)
-// CHECK: ret <2 x double> [[VABD2_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vabdq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> [[V1]], <2 x double> [[V2]])
+// CHECK-NEXT: ret <2 x double> [[VABD2_I]]
+//
float64x2_t test_vabdq_f64(float64x2_t v1, float64x2_t v2) {
  return vabdq_f64(v1, v2);
}

-// CHECK-LABEL: @test_vbsl_s8(
-// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
-// CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, splat (i8 -1)
-// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
-// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK: ret <8 x i8> [[VBSL2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_I:%.*]] = and <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[V1]], splat (i8 -1)
+// CHECK-NEXT: [[VBSL1_I:%.*]] = and <8 x i8> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL2_I:%.*]] = or disjoint <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: ret <8 x i8> [[VBSL2_I]]
+//
int8x8_t test_vbsl_s8(uint8x8_t v1, int8x8_t v2, int8x8_t v3) {
  return vbsl_s8(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbsl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <4 x i16> %v1, splat (i16 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP4]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i16> [[V1]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP1]]
+//
int8x8_t test_vbsl_s16(uint16x4_t v1, int16x4_t v2, int16x4_t v3) {
  return (int8x8_t)vbsl_s16(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbsl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <2 x i32> %v1, splat (i32 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <2 x i32> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vbsl_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i32> [[V1]], splat (i32 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i32> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <2 x i32> [[VBSL5_I]]
+//
int32x2_t test_vbsl_s32(uint32x2_t v1, int32x2_t v2, int32x2_t v3) {
  return vbsl_s32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbsl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <1 x i64> %v1, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <1 x i64> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vbsl_s64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]], <1 x i64> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <1 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <1 x i64> [[V1]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <1 x i64> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <1 x i64> [[VBSL5_I]]
+//
int64x1_t test_vbsl_s64(uint64x1_t v1, int64x1_t v2, int64x1_t v3) {
  return vbsl_s64(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbsl_u8(
-// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
-// CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, splat (i8 -1)
-// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
-// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK: ret <8 x i8> [[VBSL2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_I:%.*]] = and <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[V1]], splat (i8 -1)
+// CHECK-NEXT: [[VBSL1_I:%.*]] = and <8 x i8> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL2_I:%.*]] = or disjoint <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: ret <8 x i8> [[VBSL2_I]]
+//
uint8x8_t test_vbsl_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
  return vbsl_u8(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbsl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <4 x i16> %v1, splat (i16 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <4 x i16> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vbsl_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i16> [[V1]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <4 x i16> [[VBSL5_I]]
+//
uint16x4_t test_vbsl_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
  return vbsl_u16(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbsl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <2 x i32> %v1, splat (i32 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <2 x i32> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vbsl_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]], <2 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i32> [[V1]], splat (i32 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i32> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <2 x i32> [[VBSL5_I]]
+//
uint32x2_t test_vbsl_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
  return vbsl_u32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbsl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <1 x i64> %v1, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <1 x i64> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vbsl_u64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x i64> noundef [[V2:%.*]], <1 x i64> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <1 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <1 x i64> [[V1]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <1 x i64> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <1 x i64> [[VBSL5_I]]
+//
uint64x1_t test_vbsl_u64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) {
  return vbsl_u64(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbsl_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
-// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
-// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i32> %v1, [[VBSL1_I]]
-// CHECK: [[TMP4:%.*]] = xor <2 x i32> %v1, splat (i32 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP4]], [[VBSL2_I]]
-// CHECK: [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float>
-// CHECK: ret <2 x float> [[TMP5]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vbsl_f32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]], <2 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <2 x float> [[V2]] to <2 x i32>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <2 x float> [[V3]] to <2 x i32>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i32> [[V1]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i32> [[V1]], splat (i32 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i32> [[TMP0]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP1]]
+//
float32x2_t test_vbsl_f32(uint32x2_t v1, float32x2_t v2, float32x2_t v3) {
  return vbsl_f32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbsl_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v3 to <8 x i8>
-// CHECK: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
-// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> %v1, [[VBSL1_I]]
-// CHECK: [[TMP3:%.*]] = xor <1 x i64> %v1, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
-// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[VBSL5_I]] to <1 x double>
-// CHECK: ret <1 x double> [[TMP4]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vbsl_f64(
+// CHECK-SAME: <1 x i64> noundef [[V1:%.*]], <1 x double> noundef [[V2:%.*]], <1 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <1 x double> [[V2]] to <1 x i64>
+// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <1 x double> [[V3]] to <1 x i64>
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <1 x i64> [[V1]], [[VBSL1_I]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <1 x i64> [[V1]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP0]], [[VBSL2_I]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[VBSL5_I]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP1]]
+//
float64x1_t test_vbsl_f64(uint64x1_t v1, float64x1_t v2, float64x1_t v3) {
  return vbsl_f64(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbsl_p8(
-// CHECK: [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
-// CHECK: [[TMP0:%.*]] = xor <8 x i8> %v1, splat (i8 -1)
-// CHECK: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
-// CHECK: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK: ret <8 x i8> [[VBSL2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_p8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]], <8 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_I:%.*]] = and <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[V1]], splat (i8 -1)
+// CHECK-NEXT: [[VBSL1_I:%.*]] = and <8 x i8> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL2_I:%.*]] = or disjoint <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: ret <8 x i8> [[VBSL2_I]]
+//
poly8x8_t test_vbsl_p8(uint8x8_t v1, poly8x8_t v2, poly8x8_t v3) {
  return vbsl_p8(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbsl_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <4 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <4 x i16> %v1, splat (i16 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <4 x i16> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vbsl_p16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]], <4 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i16> [[V1]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <4 x i16> [[VBSL5_I]]
+//
poly16x4_t test_vbsl_p16(uint16x4_t v1, poly16x4_t v2, poly16x4_t v3) {
  return vbsl_p16(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbslq_s8(
-// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2
-// CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, splat (i8 -1)
-// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3
-// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK: ret <16 x i8> [[VBSL2_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vbslq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_I:%.*]] = and <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[V1]], splat (i8 -1)
+// CHECK-NEXT: [[VBSL1_I:%.*]] = and <16 x i8> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL2_I:%.*]] = or disjoint <16 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: ret <16 x i8> [[VBSL2_I]]
+//
int8x16_t test_vbslq_s8(uint8x16_t v1, int8x16_t v2, int8x16_t v3) {
  return vbslq_s8(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbslq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <8 x i16> %v1, splat (i16 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <8 x i16> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vbslq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i16> [[V1]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <8 x i16> [[VBSL5_I]]
+//
int16x8_t test_vbslq_s16(uint16x8_t v1, int16x8_t v2, int16x8_t v3) {
  return vbslq_s16(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbslq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <4 x i32> %v1, splat (i32 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <4 x i32> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vbslq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i32> [[V1]], splat (i32 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i32> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <4 x i32> [[VBSL5_I]]
+//
int32x4_t test_vbslq_s32(uint32x4_t v1, int32x4_t v2, int32x4_t v3) {
  return vbslq_s32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbslq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <2 x i64> %v1, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <2 x i64> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vbslq_s64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]], <2 x i64> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i64> [[V1]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i64> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <2 x i64> [[VBSL5_I]]
+//
int64x2_t test_vbslq_s64(uint64x2_t v1, int64x2_t v2, int64x2_t v3) {
  return vbslq_s64(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbslq_u8(
-// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2
-// CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, splat (i8 -1)
-// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3
-// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
-// CHECK: ret <16 x i8> [[VBSL2_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vbslq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_I:%.*]] = and <16 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[V1]], splat (i8 -1)
+// CHECK-NEXT: [[VBSL1_I:%.*]] = and <16 x i8> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL2_I:%.*]] = or disjoint <16 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK-NEXT: ret <16 x i8> [[VBSL2_I]]
+//
uint8x16_t test_vbslq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
  return vbslq_u8(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbslq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <8 x i16> %v1, splat (i16 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <8 x i16> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vbslq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <8 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i16> [[V1]], splat (i16 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <8 x i16> [[VBSL5_I]]
+//
uint16x8_t test_vbslq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
  return vbslq_u16(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbslq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <4 x i32> %v1, splat (i32 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <4 x i32> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vbslq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]], <4 x i32> noundef [[V3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i32> [[V1]], splat (i32 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i32> [[V3]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <4 x i32> [[VBSL5_I]]
+//
int32x4_t test_vbslq_u32(uint32x4_t v1, int32x4_t v2, int32x4_t v3) {
  return vbslq_s32(v1, v2, v3);
}

-// CHECK-LABEL: @test_vbslq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8>
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> %v1, %v2
-// CHECK: [[TMP3:%.*]] = xor <2 x i64> %v1, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], %v3
-// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <2 x i64> [[VBSL5_I]]
+// CHECK-LABEL: define
dso_local <2 x i64> @test_vbslq_u64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]], <2 x i64> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i64> [[V1]], splat (i64 -1) +// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i64> [[V3]], [[TMP0]] +// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK-NEXT: ret <2 x i64> [[VBSL5_I]] +// uint64x2_t test_vbslq_u64(uint64x2_t v1, uint64x2_t v2, uint64x2_t v3) { return vbslq_u64(v1, v2, v3); } -// CHECK-LABEL: @test_vbslq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8> -// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> -// CHECK: [[VBSL3_I:%.*]] = and <4 x i32> %v1, [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <4 x i32> %v1, splat (i32 -1) -// CHECK: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]] -// CHECK: [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]] -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float> -// CHECK: ret <4 x float> [[TMP4]] +// CHECK-LABEL: define dso_local <4 x float> @test_vbslq_f32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]], <4 x float> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <4 x float> [[V2]] to <4 x i32> +// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <4 x float> [[V3]] to <4 x i32> +// CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i32> [[V1]], [[VBSL1_I]] +// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i32> [[V1]], splat (i32 -1) +// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i32> [[TMP0]], [[VBSL2_I]] +// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <4 x i32> [[VBSL3_I]], [[VBSL4_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP1]] +// float32x4_t test_vbslq_f32(uint32x4_t v1, float32x4_t v2, float32x4_t v3) { return vbslq_f32(v1, v2, v3); } -// CHECK-LABEL: @test_vbslq_p8( -// CHECK: [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2 -// CHECK: [[TMP0:%.*]] = xor <16 x i8> %v1, splat (i8 -1) -// CHECK: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3 -// CHECK: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] -// CHECK: ret <16 x i8> [[VBSL2_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vbslq_p8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]], <16 x i8> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VBSL_I:%.*]] = and <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[V1]], splat (i8 -1) +// CHECK-NEXT: [[VBSL1_I:%.*]] = and <16 x i8> [[V3]], [[TMP0]] +// CHECK-NEXT: [[VBSL2_I:%.*]] = or disjoint <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK-NEXT: ret <16 x i8> [[VBSL2_I]] +// poly8x16_t test_vbslq_p8(uint8x16_t v1, poly8x16_t v2, poly8x16_t v3) { return vbslq_p8(v1, v2, v3); } -// CHECK-LABEL: @test_vbslq_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8> -// CHECK: [[VBSL3_I:%.*]] = and <8 x i16> %v1, %v2 -// CHECK: [[TMP3:%.*]] = xor <8 x i16> %v1, splat (i16 -1) -// CHECK: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], %v3 -// CHECK: 
[[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] -// CHECK: ret <8 x i16> [[VBSL5_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vbslq_p16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]], <8 x i16> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VBSL3_I:%.*]] = and <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i16> [[V1]], splat (i16 -1) +// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[V3]], [[TMP0]] +// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <8 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK-NEXT: ret <8 x i16> [[VBSL5_I]] +// poly16x8_t test_vbslq_p16(uint16x8_t v1, poly16x8_t v2, poly16x8_t v3) { return vbslq_p16(v1, v2, v3); } -// CHECK-LABEL: @test_vbslq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8> -// CHECK: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> -// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> %v1, [[VBSL1_I]] -// CHECK: [[TMP3:%.*]] = xor <2 x i64> %v1, splat (i64 -1) -// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]] -// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]] -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[VBSL5_I]] to <2 x double> -// CHECK: ret <2 x double> [[TMP4]] +// CHECK-LABEL: define dso_local <2 x double> @test_vbslq_f64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]], <2 x double> noundef [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <2 x double> [[V2]] to <2 x i64> +// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <2 x double> [[V3]] to <2 x i64> +// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i64> [[V1]], [[VBSL1_I]] +// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i64> [[V1]], splat (i64 -1) +// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP0]], [[VBSL2_I]] +// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <2 x i64> [[VBSL3_I]], [[VBSL4_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[VBSL5_I]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[TMP1]] +// float64x2_t test_vbslq_f64(uint64x2_t v1, float64x2_t v2, float64x2_t v3) { return vbslq_f64(v1, v2, v3); } -// CHECK-LABEL: @test_vrecps_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %v1, <2 x float> %v2) -// CHECK: ret <2 x float> [[VRECPS_V2_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vrecps_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> [[V1]], <2 x float> [[V2]]) +// CHECK-NEXT: ret <2 x float> [[VRECPS_V2_I]] +// float32x2_t test_vrecps_f32(float32x2_t v1, float32x2_t v2) { return vrecps_f32(v1, v2); } -// CHECK-LABEL: @test_vrecpsq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %v1, <4 x float> %v2) -// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VRECPSQ_V2_I]] +// CHECK-LABEL: define 
dso_local <4 x float> @test_vrecpsq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> [[V1]], <4 x float> [[V2]]) +// CHECK-NEXT: ret <4 x float> [[VRECPSQ_V2_I]] +// float32x4_t test_vrecpsq_f32(float32x4_t v1, float32x4_t v2) { return vrecpsq_f32(v1, v2); } -// CHECK-LABEL: @test_vrecpsq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VRECPSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> %v1, <2 x double> %v2) -// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <2 x double> [[VRECPSQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x double> [[VRECPSQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrecpsq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> [[V1]], <2 x double> [[V2]]) +// CHECK-NEXT: ret <2 x double> [[VRECPSQ_V2_I]] +// float64x2_t test_vrecpsq_f64(float64x2_t v1, float64x2_t v2) { return vrecpsq_f64(v1, v2); } -// CHECK-LABEL: @test_vrsqrts_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %v1, <2 x float> %v2) -// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8> -// CHECK: ret <2 x float> [[VRSQRTS_V2_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vrsqrts_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> [[V1]], <2 x float> [[V2]]) +// CHECK-NEXT: ret <2 x float> [[VRSQRTS_V2_I]] +// float32x2_t test_vrsqrts_f32(float32x2_t v1, float32x2_t v2) { return vrsqrts_f32(v1, v2); } -// CHECK-LABEL: @test_vrsqrtsq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %v1, <4 x float> %v2) -// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VRSQRTSQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vrsqrtsq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> [[V1]], <4 x float> [[V2]]) +// CHECK-NEXT: ret <4 x float> [[VRSQRTSQ_V2_I]] +// float32x4_t test_vrsqrtsq_f32(float32x4_t v1, float32x4_t v2) { return vrsqrtsq_f32(v1, v2); } -// CHECK-LABEL: @test_vrsqrtsq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> %v1, <2 x double> %v2) -// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <2 x double> [[VRSQRTSQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x double> [[VRSQRTSQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x double> 
@test_vrsqrtsq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> [[V1]], <2 x double> [[V2]]) +// CHECK-NEXT: ret <2 x double> [[VRSQRTSQ_V2_I]] +// float64x2_t test_vrsqrtsq_f64(float64x2_t v1, float64x2_t v2) { return vrsqrtsq_f64(v1, v2); } -// CHECK-LABEL: @test_vcage_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> %v1, <2 x float> %v2) -// CHECK: ret <2 x i32> [[VCAGE_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcage_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[V1]], <2 x float> [[V2]]) +// CHECK-NEXT: ret <2 x i32> [[VCAGE_V2_I]] +// uint32x2_t test_vcage_f32(float32x2_t v1, float32x2_t v2) { return vcage_f32(v1, v2); } -// CHECK-LABEL: @test_vcage_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VCAGE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> %a, <1 x double> %b) -// CHECK: ret <1 x i64> [[VCAGE_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcage_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> [[A]], <1 x double> [[B]]) +// CHECK-NEXT: ret <1 x i64> [[VCAGE_V2_I]] +// uint64x1_t test_vcage_f64(float64x1_t a, float64x1_t b) { return vcage_f64(a, b); } -// CHECK-LABEL: @test_vcageq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> %v1, <4 x float> %v2) -// CHECK: ret <4 x i32> [[VCAGEQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcageq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[V1]], <4 x float> [[V2]]) +// CHECK-NEXT: ret <4 x i32> [[VCAGEQ_V2_I]] +// uint32x4_t test_vcageq_f32(float32x4_t v1, float32x4_t v2) { return vcageq_f32(v1, v2); } -// CHECK-LABEL: @test_vcageq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VCAGEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> %v1, <2 x double> %v2) -// CHECK: ret <2 x i64> [[VCAGEQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcageq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[V1]], <2 x double> [[V2]]) +// CHECK-NEXT: ret <2 x i64> [[VCAGEQ_V2_I]] +// uint64x2_t test_vcageq_f64(float64x2_t v1, float64x2_t v2) { return vcageq_f64(v1, v2); } -// CHECK-LABEL: @test_vcagt_f32( -// 
CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> %v1, <2 x float> %v2) -// CHECK: ret <2 x i32> [[VCAGT_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcagt_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[V1]], <2 x float> [[V2]]) +// CHECK-NEXT: ret <2 x i32> [[VCAGT_V2_I]] +// uint32x2_t test_vcagt_f32(float32x2_t v1, float32x2_t v2) { return vcagt_f32(v1, v2); } -// CHECK-LABEL: @test_vcagt_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VCAGT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> %a, <1 x double> %b) -// CHECK: ret <1 x i64> [[VCAGT_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcagt_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> [[A]], <1 x double> [[B]]) +// CHECK-NEXT: ret <1 x i64> [[VCAGT_V2_I]] +// uint64x1_t test_vcagt_f64(float64x1_t a, float64x1_t b) { return vcagt_f64(a, b); } -// CHECK-LABEL: @test_vcagtq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> %v1, <4 x float> %v2) -// CHECK: ret <4 x i32> [[VCAGTQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcagtq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[V1]], <4 x float> [[V2]]) +// CHECK-NEXT: ret <4 x i32> [[VCAGTQ_V2_I]] +// uint32x4_t test_vcagtq_f32(float32x4_t v1, float32x4_t v2) { return vcagtq_f32(v1, v2); } -// CHECK-LABEL: @test_vcagtq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VCAGTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> %v1, <2 x double> %v2) -// CHECK: ret <2 x i64> [[VCAGTQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcagtq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[V1]], <2 x double> [[V2]]) +// CHECK-NEXT: ret <2 x i64> [[VCAGTQ_V2_I]] +// uint64x2_t test_vcagtq_f64(float64x2_t v1, float64x2_t v2) { return vcagtq_f64(v1, v2); } -// CHECK-LABEL: @test_vcale_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> %v2, <2 x float> %v1) -// CHECK: ret <2 x i32> [[VCALE_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcale_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[V2]], <2 x float> [[V1]]) +// CHECK-NEXT: ret <2 x i32> [[VCALE_V2_I]] +// uint32x2_t test_vcale_f32(float32x2_t v1, float32x2_t v2) { return vcale_f32(v1, v2); // Using registers other than v0, v1 are possible, but would be odd. } -// CHECK-LABEL: @test_vcale_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VCALE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> %b, <1 x double> %a) -// CHECK: ret <1 x i64> [[VCALE_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcale_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> [[B]], <1 x double> [[A]]) +// CHECK-NEXT: ret <1 x i64> [[VCALE_V2_I]] +// uint64x1_t test_vcale_f64(float64x1_t a, float64x1_t b) { return vcale_f64(a, b); } -// CHECK-LABEL: @test_vcaleq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> %v2, <4 x float> %v1) -// CHECK: ret <4 x i32> [[VCALEQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcaleq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[V2]], <4 x float> [[V1]]) +// CHECK-NEXT: ret <4 x i32> [[VCALEQ_V2_I]] +// uint32x4_t test_vcaleq_f32(float32x4_t v1, float32x4_t v2) { return vcaleq_f32(v1, v2); // Using registers other than v0, v1 are possible, but would be odd. } -// CHECK-LABEL: @test_vcaleq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VCALEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> %v2, <2 x double> %v1) -// CHECK: ret <2 x i64> [[VCALEQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcaleq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[V2]], <2 x double> [[V1]]) +// CHECK-NEXT: ret <2 x i64> [[VCALEQ_V2_I]] +// uint64x2_t test_vcaleq_f64(float64x2_t v1, float64x2_t v2) { return vcaleq_f64(v1, v2); // Using registers other than v0, v1 are possible, but would be odd. 
} -// CHECK-LABEL: @test_vcalt_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8> -// CHECK: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> %v2, <2 x float> %v1) -// CHECK: ret <2 x i32> [[VCALT_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcalt_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[V2]], <2 x float> [[V1]]) +// CHECK-NEXT: ret <2 x i32> [[VCALT_V2_I]] +// uint32x2_t test_vcalt_f32(float32x2_t v1, float32x2_t v2) { return vcalt_f32(v1, v2); // Using registers other than v0, v1 are possible, but would be odd. } -// CHECK-LABEL: @test_vcalt_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VCALT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> %b, <1 x double> %a) -// CHECK: ret <1 x i64> [[VCALT_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcalt_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> [[B]], <1 x double> [[A]]) +// CHECK-NEXT: ret <1 x i64> [[VCALT_V2_I]] +// uint64x1_t test_vcalt_f64(float64x1_t a, float64x1_t b) { return vcalt_f64(a, b); } -// CHECK-LABEL: @test_vcaltq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8> -// CHECK: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> %v2, <4 x float> %v1) -// CHECK: ret <4 x i32> [[VCALTQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcaltq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[V2]], <4 x float> [[V1]]) +// CHECK-NEXT: ret <4 x i32> [[VCALTQ_V2_I]] +// uint32x4_t test_vcaltq_f32(float32x4_t v1, float32x4_t v2) { return vcaltq_f32(v1, v2); // Using registers other than v0, v1 are possible, but would be odd. } -// CHECK-LABEL: @test_vcaltq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8> -// CHECK: [[VCALTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> %v2, <2 x double> %v1) -// CHECK: ret <2 x i64> [[VCALTQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcaltq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[V2]], <2 x double> [[V1]]) +// CHECK-NEXT: ret <2 x i64> [[VCALTQ_V2_I]] +// uint64x2_t test_vcaltq_f64(float64x2_t v1, float64x2_t v2) { return vcaltq_f64(v1, v2); // Using registers other than v0, v1 are possible, but would be odd. 
} -// CHECK-LABEL: @test_vtst_s8( -// CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2 -// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> -// CHECK: ret <8 x i8> [[VTST_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vtst_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VTST_I]] +// uint8x8_t test_vtst_s8(int8x8_t v1, int8x8_t v2) { return vtst_s8(v1, v2); } -// CHECK-LABEL: @test_vtst_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <4 x i16> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> -// CHECK: ret <4 x i16> [[VTST_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vtst_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i16> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VTST_I]] +// uint16x4_t test_vtst_s16(int16x4_t v1, int16x4_t v2) { return vtst_s16(v1, v2); } -// CHECK-LABEL: @test_vtst_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <2 x i32> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32> -// CHECK: ret <2 x i32> [[VTST_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vtst_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VTST_I]] +// uint32x2_t test_vtst_s32(int32x2_t v1, int32x2_t v2) { return vtst_s32(v1, v2); } -// CHECK-LABEL: @test_vtst_u8( -// CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2 -// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> -// CHECK: ret <8 x i8> [[VTST_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vtst_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VTST_I]] +// uint8x8_t test_vtst_u8(uint8x8_t v1, uint8x8_t v2) { return vtst_u8(v1, v2); } -// CHECK-LABEL: @test_vtst_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <4 x i16> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] 
= sext <4 x i1> [[TMP3]] to <4 x i16> -// CHECK: ret <4 x i16> [[VTST_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vtst_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i16> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VTST_I]] +// uint16x4_t test_vtst_u16(uint16x4_t v1, uint16x4_t v2) { return vtst_u16(v1, v2); } -// CHECK-LABEL: @test_vtst_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <2 x i32> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32> -// CHECK: ret <2 x i32> [[VTST_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vtst_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VTST_I]] +// uint32x2_t test_vtst_u32(uint32x2_t v1, uint32x2_t v2) { return vtst_u32(v1, v2); } -// CHECK-LABEL: @test_vtstq_s8( -// CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2 -// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> -// CHECK: ret <16 x i8> [[VTST_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vtstq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VTST_I]] +// uint8x16_t test_vtstq_s8(int8x16_t v1, int8x16_t v2) { return vtstq_s8(v1, v2); } -// CHECK-LABEL: @test_vtstq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <8 x i16> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> -// CHECK: ret <8 x i16> [[VTST_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vtstq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VTST_I]] +// uint16x8_t test_vtstq_s16(int16x8_t v1, int16x8_t v2) { return vtstq_s16(v1, v2); } -// CHECK-LABEL: @test_vtstq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <4 x i32> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> -// CHECK: ret <4 x i32> [[VTST_I]] +// CHECK-LABEL: define dso_local <4 x i32> 
@test_vtstq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VTST_I]] +// uint32x4_t test_vtstq_s32(int32x4_t v1, int32x4_t v2) { return vtstq_s32(v1, v2); } -// CHECK-LABEL: @test_vtstq_u8( -// CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2 -// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> -// CHECK: ret <16 x i8> [[VTST_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vtstq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VTST_I]] +// uint8x16_t test_vtstq_u8(uint8x16_t v1, uint8x16_t v2) { return vtstq_u8(v1, v2); } -// CHECK-LABEL: @test_vtstq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <8 x i16> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> -// CHECK: ret <8 x i16> [[VTST_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vtstq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VTST_I]] +// uint16x8_t test_vtstq_u16(uint16x8_t v1, uint16x8_t v2) { return vtstq_u16(v1, v2); } -// CHECK-LABEL: @test_vtstq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <4 x i32> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> -// CHECK: ret <4 x i32> [[VTST_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vtstq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VTST_I]] +// uint32x4_t test_vtstq_u32(uint32x4_t v1, uint32x4_t v2) { return vtstq_u32(v1, v2); } -// CHECK-LABEL: @test_vtstq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <2 x i64> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64> -// CHECK: ret <2 x i64> [[VTST_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vtstq_s64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i64> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VTST_I]] +// uint64x2_t test_vtstq_s64(int64x2_t v1, int64x2_t v2) { return vtstq_s64(v1, v2); } -// CHECK-LABEL: @test_vtstq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <2 x i64> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <2 x i64> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i64> -// CHECK: ret <2 x i64> [[VTST_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vtstq_u64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i64> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VTST_I]] +// uint64x2_t test_vtstq_u64(uint64x2_t v1, uint64x2_t v2) { return vtstq_u64(v1, v2); } -// CHECK-LABEL: @test_vtst_p8( -// CHECK: [[TMP0:%.*]] = and <8 x i8> %v1, %v2 -// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> -// CHECK: ret <8 x i8> [[VTST_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vtst_p8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VTST_I]] +// uint8x8_t test_vtst_p8(poly8x8_t v1, poly8x8_t v2) { return vtst_p8(v1, v2); } -// CHECK-LABEL: @test_vtst_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <4 x i16> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> -// CHECK: ret <4 x i16> [[VTST_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vtst_p16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i16> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VTST_I]] +// uint16x4_t test_vtst_p16(poly16x4_t v1, poly16x4_t v2) { return vtst_p16(v1, v2); } -// CHECK-LABEL: @test_vtstq_p8( -// CHECK: [[TMP0:%.*]] = and <16 x i8> %v1, %v2 -// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> -// CHECK: ret <16 x i8> [[VTST_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vtstq_p8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> 
[[VTST_I]] +// uint8x16_t test_vtstq_p8(poly8x16_t v1, poly8x16_t v2) { return vtstq_p8(v1, v2); } -// CHECK-LABEL: @test_vtstq_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <8 x i16> %v1, %v2 -// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> -// CHECK: ret <8 x i16> [[VTST_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vtstq_p16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VTST_I]] +// uint16x8_t test_vtstq_p16(poly16x8_t v1, poly16x8_t v2) { return vtstq_p16(v1, v2); } -// CHECK-LABEL: @test_vtst_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <1 x i64> %a, %b -// CHECK: [[TMP3:%.*]] = icmp ne <1 x i64> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64> -// CHECK: ret <1 x i64> [[VTST_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vtst_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <1 x i64> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VTST_I]] +// uint64x1_t test_vtst_s64(int64x1_t a, int64x1_t b) { return vtst_s64(a, b); } -// CHECK-LABEL: @test_vtst_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <1 x i64> %a, %b -// CHECK: [[TMP3:%.*]] = icmp ne <1 x i64> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP3]] to <1 x i64> -// CHECK: ret <1 x i64> [[VTST_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vtst_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <1 x i64> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VTST_I]] +// uint64x1_t test_vtst_u64(uint64x1_t a, uint64x1_t b) { return vtst_u64(a, b); } -// CHECK-LABEL: @test_vceq_s8( -// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vceq_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vceq_s8(int8x8_t v1, int8x8_t v2) { return vceq_s8(v1, v2); } -// CHECK-LABEL: @test_vceq_s16( -// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: 
define dso_local <4 x i16> @test_vceq_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vceq_s16(int16x4_t v1, int16x4_t v2) { return vceq_s16(v1, v2); } -// CHECK-LABEL: @test_vceq_s32( -// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vceq_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vceq_s32(int32x2_t v1, int32x2_t v2) { return vceq_s32(v1, v2); } -// CHECK-LABEL: @test_vceq_s64( -// CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vceq_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vceq_s64(int64x1_t a, int64x1_t b) { return vceq_s64(a, b); } -// CHECK-LABEL: @test_vceq_u64( -// CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vceq_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vceq_u64(uint64x1_t a, uint64x1_t b) { return vceq_u64(a, b); } -// CHECK-LABEL: @test_vceq_f32( -// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vceq_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <2 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vceq_f32(float32x2_t v1, float32x2_t v2) { return vceq_f32(v1, v2); } -// CHECK-LABEL: @test_vceq_f64( -// CHECK: [[CMP_I:%.*]] = fcmp oeq <1 x double> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vceq_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <1 x double> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vceq_f64(float64x1_t a, float64x1_t b) { return 
vceq_f64(a, b); } -// CHECK-LABEL: @test_vceq_u8( -// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vceq_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vceq_u8(uint8x8_t v1, uint8x8_t v2) { return vceq_u8(v1, v2); } -// CHECK-LABEL: @test_vceq_u16( -// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vceq_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vceq_u16(uint16x4_t v1, uint16x4_t v2) { return vceq_u16(v1, v2); } -// CHECK-LABEL: @test_vceq_u32( -// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vceq_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vceq_u32(uint32x2_t v1, uint32x2_t v2) { return vceq_u32(v1, v2); } -// CHECK-LABEL: @test_vceq_p8( -// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vceq_p8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vceq_p8(poly8x8_t v1, poly8x8_t v2) { return vceq_p8(v1, v2); } -// CHECK-LABEL: @test_vceqq_s8( -// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vceqq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vceqq_s8(int8x16_t v1, int8x16_t v2) { return vceqq_s8(v1, v2); } -// CHECK-LABEL: @test_vceqq_s16( -// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vceqq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x 
i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vceqq_s16(int16x8_t v1, int16x8_t v2) { return vceqq_s16(v1, v2); } -// CHECK-LABEL: @test_vceqq_s32( -// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vceqq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vceqq_s32(int32x4_t v1, int32x4_t v2) { return vceqq_s32(v1, v2); } -// CHECK-LABEL: @test_vceqq_f32( -// CHECK: [[CMP_I:%.*]] = fcmp oeq <4 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vceqq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <4 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vceqq_f32(float32x4_t v1, float32x4_t v2) { return vceqq_f32(v1, v2); } -// CHECK-LABEL: @test_vceqq_u8( -// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vceqq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vceqq_u8(uint8x16_t v1, uint8x16_t v2) { return vceqq_u8(v1, v2); } -// CHECK-LABEL: @test_vceqq_u16( -// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vceqq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vceqq_u16(uint16x8_t v1, uint16x8_t v2) { return vceqq_u16(v1, v2); } -// CHECK-LABEL: @test_vceqq_u32( -// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vceqq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vceqq_u32(uint32x4_t v1, uint32x4_t v2) { return vceqq_u32(v1, v2); } -// CHECK-LABEL: @test_vceqq_p8( -// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> 
-// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vceqq_p8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vceqq_p8(poly8x16_t v1, poly8x16_t v2) { return vceqq_p8(v1, v2); } -// CHECK-LABEL: @test_vceqq_s64( -// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vceqq_s64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vceqq_s64(int64x2_t v1, int64x2_t v2) { return vceqq_s64(v1, v2); } -// CHECK-LABEL: @test_vceqq_u64( -// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vceqq_u64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vceqq_u64(uint64x2_t v1, uint64x2_t v2) { return vceqq_u64(v1, v2); } -// CHECK-LABEL: @test_vceqq_f64( -// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x double> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vceqq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <2 x double> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vceqq_f64(float64x2_t v1, float64x2_t v2) { return vceqq_f64(v1, v2); } -// CHECK-LABEL: @test_vcge_s8( -// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcge_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vcge_s8(int8x8_t v1, int8x8_t v2) { return vcge_s8(v1, v2); } -// CHECK-LABEL: @test_vcge_s16( -// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcge_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> 
[[SEXT_I]] +// uint16x4_t test_vcge_s16(int16x4_t v1, int16x4_t v2) { return vcge_s16(v1, v2); } -// CHECK-LABEL: @test_vcge_s32( -// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcge_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcge_s32(int32x2_t v1, int32x2_t v2) { return vcge_s32(v1, v2); } -// CHECK-LABEL: @test_vcge_s64( -// CHECK: [[CMP_I:%.*]] = icmp sge <1 x i64> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcge_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vcge_s64(int64x1_t a, int64x1_t b) { return vcge_s64(a, b); } -// CHECK-LABEL: @test_vcge_u64( -// CHECK: [[CMP_I:%.*]] = icmp uge <1 x i64> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcge_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vcge_u64(uint64x1_t a, uint64x1_t b) { return vcge_u64(a, b); } -// CHECK-LABEL: @test_vcge_f32( -// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcge_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <2 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcge_f32(float32x2_t v1, float32x2_t v2) { return vcge_f32(v1, v2); } -// CHECK-LABEL: @test_vcge_f64( -// CHECK: [[CMP_I:%.*]] = fcmp oge <1 x double> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcge_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <1 x double> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vcge_f64(float64x1_t a, float64x1_t b) { return vcge_f64(a, b); } -// CHECK-LABEL: @test_vcge_u8( -// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcge_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> 
noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vcge_u8(uint8x8_t v1, uint8x8_t v2) { return vcge_u8(v1, v2); } -// CHECK-LABEL: @test_vcge_u16( -// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcge_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcge_u16(uint16x4_t v1, uint16x4_t v2) { return vcge_u16(v1, v2); } -// CHECK-LABEL: @test_vcge_u32( -// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcge_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcge_u32(uint32x2_t v1, uint32x2_t v2) { return vcge_u32(v1, v2); } -// CHECK-LABEL: @test_vcgeq_s8( -// CHECK: [[CMP_I:%.*]] = icmp sge <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcgeq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcgeq_s8(int8x16_t v1, int8x16_t v2) { return vcgeq_s8(v1, v2); } -// CHECK-LABEL: @test_vcgeq_s16( -// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgeq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcgeq_s16(int16x8_t v1, int16x8_t v2) { return vcgeq_s16(v1, v2); } -// CHECK-LABEL: @test_vcgeq_s32( -// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgeq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcgeq_s32(int32x4_t v1, int32x4_t v2) { return vcgeq_s32(v1, v2); } -// CHECK-LABEL: @test_vcgeq_f32( -// CHECK: [[CMP_I:%.*]] = fcmp 
oge <4 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgeq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <4 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcgeq_f32(float32x4_t v1, float32x4_t v2) { return vcgeq_f32(v1, v2); } -// CHECK-LABEL: @test_vcgeq_u8( -// CHECK: [[CMP_I:%.*]] = icmp uge <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcgeq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcgeq_u8(uint8x16_t v1, uint8x16_t v2) { return vcgeq_u8(v1, v2); } -// CHECK-LABEL: @test_vcgeq_u16( -// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgeq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcgeq_u16(uint16x8_t v1, uint16x8_t v2) { return vcgeq_u16(v1, v2); } -// CHECK-LABEL: @test_vcgeq_u32( -// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgeq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcgeq_u32(uint32x4_t v1, uint32x4_t v2) { return vcgeq_u32(v1, v2); } -// CHECK-LABEL: @test_vcgeq_s64( -// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i64> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcgeq_s64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vcgeq_s64(int64x2_t v1, int64x2_t v2) { return vcgeq_s64(v1, v2); } -// CHECK-LABEL: @test_vcgeq_u64( -// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i64> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcgeq_u64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <2 x i64> [[V1]], 
[[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcgeq_u64(uint64x2_t v1, uint64x2_t v2) {
  return vcgeq_u64(v1, v2);
}
-// CHECK-LABEL: @test_vcgeq_f64(
-// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x double> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcgeq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcgeq_f64(float64x2_t v1, float64x2_t v2) {
  return vcgeq_f64(v1, v2);
}
-// CHECK-LABEL: @test_vcle_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
// Notes about vcle:
// LE condition predicate implemented as GE, so check reversed operands.
// Using registers other than v0, v1 is possible, but would be odd.
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcle_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcle_s8(int8x8_t v1, int8x8_t v2) {
  return vcle_s8(v1, v2);
}
-// CHECK-LABEL: @test_vcle_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcle_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcle_s16(int16x4_t v1, int16x4_t v2) {
  return vcle_s16(v1, v2);
}
-// CHECK-LABEL: @test_vcle_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcle_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcle_s32(int32x2_t v1, int32x2_t v2) {
  return vcle_s32(v1, v2);
}
-// CHECK-LABEL: @test_vcle_s64(
-// CHECK: [[CMP_I:%.*]] = icmp sle <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcle_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vcle_s64(int64x1_t a, int64x1_t b) {
  return vcle_s64(a, b);
}
-// CHECK-LABEL: @test_vcle_u64(
-// CHECK: [[CMP_I:%.*]] = icmp ule <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcle_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vcle_u64(uint64x1_t a, uint64x1_t b) {
  return vcle_u64(a, b);
}
-// CHECK-LABEL: @test_vcle_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x float> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcle_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <2 x float> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcle_f32(float32x2_t v1, float32x2_t v2) {
  return vcle_f32(v1, v2);
}
-// CHECK-LABEL: @test_vcle_f64(
-// CHECK: [[CMP_I:%.*]] = fcmp ole <1 x double> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcle_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <1 x double> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vcle_f64(float64x1_t a, float64x1_t b) {
  return vcle_f64(a, b);
}
-// CHECK-LABEL: @test_vcle_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcle_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcle_u8(uint8x8_t v1, uint8x8_t v2) {
  return vcle_u8(v1, v2);
}
-// CHECK-LABEL: @test_vcle_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vcle_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcle_u16(uint16x4_t v1, uint16x4_t v2) {
  return vcle_u16(v1, v2);
}
-// CHECK-LABEL: @test_vcle_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcle_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcle_u32(uint32x2_t v1, uint32x2_t v2) { return vcle_u32(v1, v2); } -// CHECK-LABEL: @test_vcleq_s8( -// CHECK: [[CMP_I:%.*]] = icmp sle <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcleq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcleq_s8(int8x16_t v1, int8x16_t v2) { return vcleq_s8(v1, v2); } -// CHECK-LABEL: @test_vcleq_s16( -// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcleq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcleq_s16(int16x8_t v1, int16x8_t v2) { return vcleq_s16(v1, v2); } -// CHECK-LABEL: @test_vcleq_s32( -// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcleq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcleq_s32(int32x4_t v1, int32x4_t v2) { return vcleq_s32(v1, v2); } -// CHECK-LABEL: @test_vcleq_f32( -// CHECK: [[CMP_I:%.*]] = fcmp ole <4 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcleq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <4 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcleq_f32(float32x4_t v1, float32x4_t v2) { return vcleq_f32(v1, v2); } -// CHECK-LABEL: @test_vcleq_u8( -// CHECK: [[CMP_I:%.*]] = icmp ule <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcleq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcleq_u8(uint8x16_t v1, uint8x16_t v2) { return vcleq_u8(v1, v2); } -// CHECK-LABEL: @test_vcleq_u16( -// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret 
<8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcleq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcleq_u16(uint16x8_t v1, uint16x8_t v2) { return vcleq_u16(v1, v2); } -// CHECK-LABEL: @test_vcleq_u32( -// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcleq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcleq_u32(uint32x4_t v1, uint32x4_t v2) { return vcleq_u32(v1, v2); } -// CHECK-LABEL: @test_vcleq_s64( -// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i64> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcleq_s64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vcleq_s64(int64x2_t v1, int64x2_t v2) { return vcleq_s64(v1, v2); } -// CHECK-LABEL: @test_vcleq_u64( -// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i64> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcleq_u64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vcleq_u64(uint64x2_t v1, uint64x2_t v2) { return vcleq_u64(v1, v2); } -// CHECK-LABEL: @test_vcleq_f64( -// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x double> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcleq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <2 x double> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vcleq_f64(float64x2_t v1, float64x2_t v2) { return vcleq_f64(v1, v2); } -// CHECK-LABEL: @test_vcgt_s8( -// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcgt_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> 
[[SEXT_I]] +// uint8x8_t test_vcgt_s8(int8x8_t v1, int8x8_t v2) { return vcgt_s8(v1, v2); } -// CHECK-LABEL: @test_vcgt_s16( -// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcgt_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcgt_s16(int16x4_t v1, int16x4_t v2) { return vcgt_s16(v1, v2); } -// CHECK-LABEL: @test_vcgt_s32( -// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcgt_s32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcgt_s32(int32x2_t v1, int32x2_t v2) { return vcgt_s32(v1, v2); } -// CHECK-LABEL: @test_vcgt_s64( -// CHECK: [[CMP_I:%.*]] = icmp sgt <1 x i64> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcgt_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vcgt_s64(int64x1_t a, int64x1_t b) { return vcgt_s64(a, b); } -// CHECK-LABEL: @test_vcgt_u64( -// CHECK: [[CMP_I:%.*]] = icmp ugt <1 x i64> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcgt_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <1 x i64> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vcgt_u64(uint64x1_t a, uint64x1_t b) { return vcgt_u64(a, b); } -// CHECK-LABEL: @test_vcgt_f32( -// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcgt_f32( +// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <2 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcgt_f32(float32x2_t v1, float32x2_t v2) { return vcgt_f32(v1, v2); } -// CHECK-LABEL: @test_vcgt_f64( -// CHECK: [[CMP_I:%.*]] = fcmp ogt <1 x double> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcgt_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> 
noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <1 x double> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vcgt_f64(float64x1_t a, float64x1_t b) { return vcgt_f64(a, b); } -// CHECK-LABEL: @test_vcgt_u8( -// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcgt_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vcgt_u8(uint8x8_t v1, uint8x8_t v2) { return vcgt_u8(v1, v2); } -// CHECK-LABEL: @test_vcgt_u16( -// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcgt_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcgt_u16(uint16x4_t v1, uint16x4_t v2) { return vcgt_u16(v1, v2); } -// CHECK-LABEL: @test_vcgt_u32( -// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcgt_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vcgt_u32(uint32x2_t v1, uint32x2_t v2) { return vcgt_u32(v1, v2); } -// CHECK-LABEL: @test_vcgtq_s8( -// CHECK: [[CMP_I:%.*]] = icmp sgt <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcgtq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcgtq_s8(int8x16_t v1, int8x16_t v2) { return vcgtq_s8(v1, v2); } -// CHECK-LABEL: @test_vcgtq_s16( -// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgtq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcgtq_s16(int16x8_t v1, int16x8_t v2) { return vcgtq_s16(v1, v2); } -// CHECK-LABEL: @test_vcgtq_s32( -// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x 
i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgtq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcgtq_s32(int32x4_t v1, int32x4_t v2) { return vcgtq_s32(v1, v2); } -// CHECK-LABEL: @test_vcgtq_f32( -// CHECK: [[CMP_I:%.*]] = fcmp ogt <4 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgtq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <4 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcgtq_f32(float32x4_t v1, float32x4_t v2) { return vcgtq_f32(v1, v2); } -// CHECK-LABEL: @test_vcgtq_u8( -// CHECK: [[CMP_I:%.*]] = icmp ugt <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcgtq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcgtq_u8(uint8x16_t v1, uint8x16_t v2) { return vcgtq_u8(v1, v2); } -// CHECK-LABEL: @test_vcgtq_u16( -// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgtq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcgtq_u16(uint16x8_t v1, uint16x8_t v2) { return vcgtq_u16(v1, v2); } -// CHECK-LABEL: @test_vcgtq_u32( -// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgtq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcgtq_u32(uint32x4_t v1, uint32x4_t v2) { return vcgtq_u32(v1, v2); } -// CHECK-LABEL: @test_vcgtq_s64( -// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i64> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcgtq_s64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <2 x i64> [[V1]], [[V2]] 
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcgtq_s64(int64x2_t v1, int64x2_t v2) {
  return vcgtq_s64(v1, v2);
}
-// CHECK-LABEL: @test_vcgtq_u64(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i64> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcgtq_u64(
+// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <2 x i64> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcgtq_u64(uint64x2_t v1, uint64x2_t v2) {
  return vcgtq_u64(v1, v2);
}
-// CHECK-LABEL: @test_vcgtq_f64(
-// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x double> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcgtq_f64(
+// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <2 x double> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
uint64x2_t test_vcgtq_f64(float64x2_t v1, float64x2_t v2) {
  return vcgtq_f64(v1, v2);
}
-// CHECK-LABEL: @test_vclt_s8(
-// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i8> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
// Notes about vclt:
// LT condition predicate implemented as GT, so check reversed operands
// (a minimal sketch of this identity follows these notes).
// Using registers other than v0, v1 is possible, but would be odd.
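// The "reversed operands" remark above (and in the vcle notes) describes the
// AArch64 instruction selection: the register-register compares only encode
// the GT/GE predicates, so an LT/LE compare is emitted as CMGT/CMGE with the
// operands swapped. At the IR level checked in this patch, vclt/vcle still
// appear as plain slt/ult/olt (or sle/ule/ole) compares plus a sign-extend.
// A minimal C sketch of the operand-swap identity the backend relies on; the
// helper name vclt_s8_via_vcgt is hypothetical and for illustration only:
#include <arm_neon.h>
// Lane-wise, v1 < v2 iff v2 > v1, so the LT mask equals the GT mask with the
// operands swapped; both intrinsics return an all-ones/all-zeros lane mask.
static inline uint8x8_t vclt_s8_via_vcgt(int8x8_t v1, int8x8_t v2) {
  return vcgt_s8(v2, v1);
}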
+// CHECK-LABEL: define dso_local <8 x i8> @test_vclt_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <8 x i8> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vclt_s8(int8x8_t v1, int8x8_t v2) {
  return vclt_s8(v1, v2);
}
-// CHECK-LABEL: @test_vclt_s16(
-// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i16> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vclt_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <4 x i16> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vclt_s16(int16x4_t v1, int16x4_t v2) {
  return vclt_s16(v1, v2);
}
-// CHECK-LABEL: @test_vclt_s32(
-// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i32> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vclt_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <2 x i32> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vclt_s32(int32x2_t v1, int32x2_t v2) {
  return vclt_s32(v1, v2);
}
-// CHECK-LABEL: @test_vclt_s64(
-// CHECK: [[CMP_I:%.*]] = icmp slt <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vclt_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vclt_s64(int64x1_t a, int64x1_t b) {
  return vclt_s64(a, b);
}
-// CHECK-LABEL: @test_vclt_u64(
-// CHECK: [[CMP_I:%.*]] = icmp ult <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vclt_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
uint64x1_t test_vclt_u64(uint64x1_t a, uint64x1_t b) {
  return vclt_u64(a, b);
}
-// CHECK-LABEL: @test_vclt_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x float> %v1, %v2
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vclt_f32(
+// CHECK-SAME: <2 x float> noundef [[V1:%.*]], <2 x float> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x float> [[V1]], [[V2]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vclt_f32(float32x2_t v1, float32x2_t v2) {
return vclt_f32(v1, v2); } -// CHECK-LABEL: @test_vclt_f64( -// CHECK: [[CMP_I:%.*]] = fcmp olt <1 x double> %a, %b -// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> -// CHECK: ret <1 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vclt_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <1 x double> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[SEXT_I]] +// uint64x1_t test_vclt_f64(float64x1_t a, float64x1_t b) { return vclt_f64(a, b); } -// CHECK-LABEL: @test_vclt_u8( -// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> -// CHECK: ret <8 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vclt_u8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[SEXT_I]] +// uint8x8_t test_vclt_u8(uint8x8_t v1, uint8x8_t v2) { return vclt_u8(v1, v2); } -// CHECK-LABEL: @test_vclt_u16( -// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vclt_u16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <4 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vclt_u16(uint16x4_t v1, uint16x4_t v2) { return vclt_u16(v1, v2); } -// CHECK-LABEL: @test_vclt_u32( -// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> -// CHECK: ret <2 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vclt_u32( +// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <2 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SEXT_I]] +// uint32x2_t test_vclt_u32(uint32x2_t v1, uint32x2_t v2) { return vclt_u32(v1, v2); } -// CHECK-LABEL: @test_vcltq_s8( -// CHECK: [[CMP_I:%.*]] = icmp slt <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcltq_s8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcltq_s8(int8x16_t v1, int8x16_t v2) { return vcltq_s8(v1, v2); } -// CHECK-LABEL: @test_vcltq_s16( -// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcltq_s16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcltq_s16(int16x8_t v1, int16x8_t v2) { return vcltq_s16(v1, v2); } -// CHECK-LABEL: @test_vcltq_s32( -// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcltq_s32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcltq_s32(int32x4_t v1, int32x4_t v2) { return vcltq_s32(v1, v2); } -// CHECK-LABEL: @test_vcltq_f32( -// CHECK: [[CMP_I:%.*]] = fcmp olt <4 x float> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcltq_f32( +// CHECK-SAME: <4 x float> noundef [[V1:%.*]], <4 x float> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcltq_f32(float32x4_t v1, float32x4_t v2) { return vcltq_f32(v1, v2); } -// CHECK-LABEL: @test_vcltq_u8( -// CHECK: [[CMP_I:%.*]] = icmp ult <16 x i8> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> -// CHECK: ret <16 x i8> [[SEXT_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcltq_u8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[SEXT_I]] +// uint8x16_t test_vcltq_u8(uint8x16_t v1, uint8x16_t v2) { return vcltq_u8(v1, v2); } -// CHECK-LABEL: @test_vcltq_u16( -// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i16> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[SEXT_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcltq_u16( +// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <8 x i16> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcltq_u16(uint16x8_t v1, uint16x8_t v2) { return vcltq_u16(v1, v2); } -// CHECK-LABEL: @test_vcltq_u32( -// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i32> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[SEXT_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcltq_u32( +// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <4 x i32> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SEXT_I]] +// uint32x4_t test_vcltq_u32(uint32x4_t v1, uint32x4_t v2) { return vcltq_u32(v1, v2); } -// CHECK-LABEL: @test_vcltq_s64( -// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i64> %v1, %v2 -// CHECK: 
[[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcltq_s64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vcltq_s64(int64x2_t v1, int64x2_t v2) { return vcltq_s64(v1, v2); } -// CHECK-LABEL: @test_vcltq_u64( -// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i64> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcltq_u64( +// CHECK-SAME: <2 x i64> noundef [[V1:%.*]], <2 x i64> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <2 x i64> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vcltq_u64(uint64x2_t v1, uint64x2_t v2) { return vcltq_u64(v1, v2); } -// CHECK-LABEL: @test_vcltq_f64( -// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x double> %v1, %v2 -// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[SEXT_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcltq_f64( +// CHECK-SAME: <2 x double> noundef [[V1:%.*]], <2 x double> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x double> [[V1]], [[V2]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[SEXT_I]] +// uint64x2_t test_vcltq_f64(float64x2_t v1, float64x2_t v2) { return vcltq_f64(v1, v2); } -// CHECK-LABEL: @test_vhadd_s8( -// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) -// CHECK: ret <8 x i8> [[VHADD_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vhadd_s8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]]) +// CHECK-NEXT: ret <8 x i8> [[VHADD_V_I]] +// int8x8_t test_vhadd_s8(int8x8_t v1, int8x8_t v2) { return vhadd_s8(v1, v2); } -// CHECK-LABEL: @test_vhadd_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8> -// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %v1, <4 x i16> %v2) -// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VHADD_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vhadd_s16( +// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> [[V1]], <4 x i16> [[V2]]) +// CHECK-NEXT: ret <4 x i16> [[VHADD_V2_I]] +// int16x4_t test_vhadd_s16(int16x4_t v1, int16x4_t v2) { return vhadd_s16(v1, v2); } -// CHECK-LABEL: @test_vhadd_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8> -// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %v1, <2 x i32> %v2) -// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 
x i8>
-// CHECK: ret <2 x i32> [[VHADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vhadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> [[V1]], <2 x i32> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VHADD_V2_I]]
+//
 int32x2_t test_vhadd_s32(int32x2_t v1, int32x2_t v2) {
   return vhadd_s32(v1, v2);
 }
-// CHECK-LABEL: @test_vhadd_u8(
-// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VHADD_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vhadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VHADD_V_I]]
+//
 uint8x8_t test_vhadd_u8(uint8x8_t v1, uint8x8_t v2) {
   return vhadd_u8(v1, v2);
 }
-// CHECK-LABEL: @test_vhadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VHADD_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vhadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> [[V1]], <4 x i16> [[V2]])
+// CHECK-NEXT: ret <4 x i16> [[VHADD_V2_I]]
+//
 uint16x4_t test_vhadd_u16(uint16x4_t v1, uint16x4_t v2) {
   return vhadd_u16(v1, v2);
 }
-// CHECK-LABEL: @test_vhadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VHADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vhadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> [[V1]], <2 x i32> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VHADD_V2_I]]
+//
 uint32x2_t test_vhadd_u32(uint32x2_t v1, uint32x2_t v2) {
   return vhadd_u32(v1, v2);
 }
-// CHECK-LABEL: @test_vhaddq_s8(
-// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VHADDQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vhaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VHADDQ_V_I]]
+//
 int8x16_t test_vhaddq_s8(int8x16_t v1, int8x16_t v2) {
   return vhaddq_s8(v1, v2);
 }
-// CHECK-LABEL: @test_vhaddq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VHADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vhaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> [[V1]], <8 x i16> [[V2]])
+// CHECK-NEXT: ret <8 x i16> [[VHADDQ_V2_I]]
+//
 int16x8_t test_vhaddq_s16(int16x8_t v1, int16x8_t v2) {
   return vhaddq_s16(v1, v2);
 }
-// CHECK-LABEL: @test_vhaddq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VHADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vhaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> [[V1]], <4 x i32> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VHADDQ_V2_I]]
+//
 int32x4_t test_vhaddq_s32(int32x4_t v1, int32x4_t v2) {
   return vhaddq_s32(v1, v2);
 }
-// CHECK-LABEL: @test_vhaddq_u8(
-// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VHADDQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vhaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VHADDQ_V_I]]
+//
 uint8x16_t test_vhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
   return vhaddq_u8(v1, v2);
 }
-// CHECK-LABEL: @test_vhaddq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VHADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vhaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> [[V1]], <8 x i16> [[V2]])
+// CHECK-NEXT: ret <8 x i16> [[VHADDQ_V2_I]]
+//
 uint16x8_t test_vhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
   return vhaddq_u16(v1, v2);
 }
-// CHECK-LABEL: @test_vhaddq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VHADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vhaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> [[V1]], <4 x i32> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VHADDQ_V2_I]]
+//
 uint32x4_t test_vhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
   return vhaddq_u32(v1, v2);
 }
-// CHECK-LABEL: @test_vhsub_s8(
-// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VHSUB_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vhsub_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VHSUB_V_I]]
+//
 int8x8_t test_vhsub_s8(int8x8_t v1, int8x8_t v2) {
   return vhsub_s8(v1, v2);
 }
-// CHECK-LABEL: @test_vhsub_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VHSUB_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vhsub_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> [[V1]], <4 x i16> [[V2]])
+// CHECK-NEXT: ret <4 x i16> [[VHSUB_V2_I]]
+//
 int16x4_t test_vhsub_s16(int16x4_t v1, int16x4_t v2) {
   return vhsub_s16(v1, v2);
 }
-// CHECK-LABEL: @test_vhsub_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VHSUB_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vhsub_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> [[V1]], <2 x i32> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VHSUB_V2_I]]
+//
 int32x2_t test_vhsub_s32(int32x2_t v1, int32x2_t v2) {
   return vhsub_s32(v1, v2);
 }
-// CHECK-LABEL: @test_vhsub_u8(
-// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VHSUB_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vhsub_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VHSUB_V_I]]
+//
 uint8x8_t test_vhsub_u8(uint8x8_t v1, uint8x8_t v2) {
   return vhsub_u8(v1, v2);
 }
-// CHECK-LABEL: @test_vhsub_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VHSUB_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vhsub_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> [[V1]], <4 x i16> [[V2]])
+// CHECK-NEXT: ret <4 x i16> [[VHSUB_V2_I]]
+//
 uint16x4_t test_vhsub_u16(uint16x4_t v1, uint16x4_t v2) {
   return vhsub_u16(v1, v2);
 }
-// CHECK-LABEL: @test_vhsub_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VHSUB_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vhsub_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> [[V1]], <2 x i32> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VHSUB_V2_I]]
+//
 uint32x2_t test_vhsub_u32(uint32x2_t v1, uint32x2_t v2) {
   return vhsub_u32(v1, v2);
 }
-// CHECK-LABEL: @test_vhsubq_s8(
-// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VHSUBQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vhsubq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VHSUBQ_V_I]]
+//
 int8x16_t test_vhsubq_s8(int8x16_t v1, int8x16_t v2) {
   return vhsubq_s8(v1, v2);
 }
-// CHECK-LABEL: @test_vhsubq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vhsubq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> [[V1]], <8 x i16> [[V2]])
+// CHECK-NEXT: ret <8 x i16> [[VHSUBQ_V2_I]]
+//
 int16x8_t test_vhsubq_s16(int16x8_t v1, int16x8_t v2) {
   return vhsubq_s16(v1, v2);
 }
-// CHECK-LABEL: @test_vhsubq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vhsubq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> [[V1]], <4 x i32> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VHSUBQ_V2_I]]
+//
 int32x4_t test_vhsubq_s32(int32x4_t v1, int32x4_t v2) {
   return vhsubq_s32(v1, v2);
 }
-// CHECK-LABEL: @test_vhsubq_u8(
-// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VHSUBQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vhsubq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VHSUBQ_V_I]]
+//
 uint8x16_t test_vhsubq_u8(uint8x16_t v1, uint8x16_t v2) {
   return vhsubq_u8(v1, v2);
 }
-// CHECK-LABEL: @test_vhsubq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vhsubq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> [[V1]], <8 x i16> [[V2]])
+// CHECK-NEXT: ret <8 x i16> [[VHSUBQ_V2_I]]
+//
 uint16x8_t test_vhsubq_u16(uint16x8_t v1, uint16x8_t v2) {
   return vhsubq_u16(v1, v2);
 }
-// CHECK-LABEL: @test_vhsubq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vhsubq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> [[V1]], <4 x i32> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VHSUBQ_V2_I]]
+//
 uint32x4_t test_vhsubq_u32(uint32x4_t v1, uint32x4_t v2) {
   return vhsubq_u32(v1, v2);
 }
-// CHECK-LABEL: @test_vrhadd_s8(
-// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VRHADD_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrhadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VRHADD_V_I]]
+//
 int8x8_t test_vrhadd_s8(int8x8_t v1, int8x8_t v2) {
   return vrhadd_s8(v1, v2);
 }
-// CHECK-LABEL: @test_vrhadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrhadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> [[V1]], <4 x i16> [[V2]])
+// CHECK-NEXT: ret <4 x i16> [[VRHADD_V2_I]]
+//
 int16x4_t test_vrhadd_s16(int16x4_t v1, int16x4_t v2) {
   return vrhadd_s16(v1, v2);
 }
-// CHECK-LABEL: @test_vrhadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrhadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> [[V1]], <2 x i32> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VRHADD_V2_I]]
+//
 int32x2_t test_vrhadd_s32(int32x2_t v1, int32x2_t v2) {
   return vrhadd_s32(v1, v2);
 }
-// CHECK-LABEL: @test_vrhadd_u8(
-// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2)
-// CHECK: ret <8 x i8> [[VRHADD_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrhadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> [[V1]], <8 x i8> [[V2]])
+// CHECK-NEXT: ret <8 x i8> [[VRHADD_V_I]]
+//
 uint8x8_t test_vrhadd_u8(uint8x8_t v1, uint8x8_t v2) {
   return vrhadd_u8(v1, v2);
 }
-// CHECK-LABEL: @test_vrhadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %v1, <4 x i16> %v2)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrhadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[V1:%.*]], <4 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> [[V1]], <4 x i16> [[V2]])
+// CHECK-NEXT: ret <4 x i16> [[VRHADD_V2_I]]
+//
 uint16x4_t test_vrhadd_u16(uint16x4_t v1, uint16x4_t v2) {
   return vrhadd_u16(v1, v2);
 }
-// CHECK-LABEL: @test_vrhadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %v1, <2 x i32> %v2)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrhadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[V1:%.*]], <2 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> [[V1]], <2 x i32> [[V2]])
+// CHECK-NEXT: ret <2 x i32> [[VRHADD_V2_I]]
+//
 uint32x2_t test_vrhadd_u32(uint32x2_t v1, uint32x2_t v2) {
   return vrhadd_u32(v1, v2);
 }
-// CHECK-LABEL: @test_vrhaddq_s8(
-// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrhaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VRHADDQ_V_I]]
+//
 int8x16_t test_vrhaddq_s8(int8x16_t v1, int8x16_t v2) {
   return vrhaddq_s8(v1, v2);
 }
-// CHECK-LABEL: @test_vrhaddq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrhaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> [[V1]], <8 x i16> [[V2]])
+// CHECK-NEXT: ret <8 x i16> [[VRHADDQ_V2_I]]
+//
 int16x8_t test_vrhaddq_s16(int16x8_t v1, int16x8_t v2) {
   return vrhaddq_s16(v1, v2);
 }
-// CHECK-LABEL: @test_vrhaddq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrhaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> [[V1]], <4 x i32> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VRHADDQ_V2_I]]
+//
 int32x4_t test_vrhaddq_s32(int32x4_t v1, int32x4_t v2) {
   return vrhaddq_s32(v1, v2);
 }
-// CHECK-LABEL: @test_vrhaddq_u8(
-// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2)
-// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrhaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> [[V1]], <16 x i8> [[V2]])
+// CHECK-NEXT: ret <16 x i8> [[VRHADDQ_V_I]]
+//
 uint8x16_t test_vrhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
   return vrhaddq_u8(v1, v2);
 }
-// CHECK-LABEL: @test_vrhaddq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %v1, <8 x i16> %v2)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrhaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[V1:%.*]], <8 x i16> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> [[V1]], <8 x i16> [[V2]])
+// CHECK-NEXT: ret <8 x i16> [[VRHADDQ_V2_I]]
+//
 uint16x8_t test_vrhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
   return vrhaddq_u16(v1, v2);
 }
-// CHECK-LABEL: @test_vrhaddq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %v1, <4 x i32> %v2)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrhaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[V1:%.*]], <4 x i32> noundef [[V2:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> [[V1]], <4 x i32> [[V2]])
+// CHECK-NEXT: ret <4 x i32> [[VRHADDQ_V2_I]]
+//
 uint32x4_t test_vrhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
   return vrhaddq_u32(v1, v2);
 }
-// CHECK-LABEL: @test_vqadd_s8(
-// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQADD_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQADD_V_I]]
+//
 int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
   return vqadd_s8(a, b);
 }
-// CHECK-LABEL: @test_vqadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQADD_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQADD_V2_I]]
+//
 int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
   return vqadd_s16(a, b);
 }
-// CHECK-LABEL: @test_vqadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQADD_V2_I]]
+//
 int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
   return vqadd_s32(a, b);
 }
-// CHECK-LABEL: @test_vqadd_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQADD_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqadd_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQADD_V2_I]]
+//
 int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
   return vqadd_s64(a, b);
 }
-// CHECK-LABEL: @test_vqadd_u8(
-// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQADD_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQADD_V_I]]
+//
 uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
   return vqadd_u8(a, b);
 }
-// CHECK-LABEL: @test_vqadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQADD_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQADD_V2_I]]
+//
 uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
   return vqadd_u16(a, b);
 }
-// CHECK-LABEL: @test_vqadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQADD_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQADD_V2_I]]
+//
 uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
   return vqadd_u32(a, b);
 }
-// CHECK-LABEL: @test_vqadd_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQADD_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqadd_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQADD_V2_I]]
+//
 uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
   return vqadd_u64(a, b);
 }
-// CHECK-LABEL: @test_vqaddq_s8(
-// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQADDQ_V_I]]
+//
 int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
   return vqaddq_s8(a, b);
 }
-// CHECK-LABEL: @test_vqaddq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQADDQ_V2_I]]
+//
 int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
   return vqaddq_s16(a, b);
 }
-// CHECK-LABEL: @test_vqaddq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQADDQ_V2_I]]
+//
 int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
   return vqaddq_s32(a, b);
 }
-// CHECK-LABEL: @test_vqaddq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqaddq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQADDQ_V2_I]]
+//
 int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
   return vqaddq_s64(a, b);
 }
-// CHECK-LABEL: @test_vqaddq_u8(
-// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQADDQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQADDQ_V_I]]
+//
 uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vqaddq_u8(a, b);
 }
-// CHECK-LABEL: @test_vqaddq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQADDQ_V2_I]]
+//
 uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vqaddq_u16(a, b);
 }
-// CHECK-LABEL: @test_vqaddq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQADDQ_V2_I]]
+//
 uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vqaddq_u32(a, b);
 }
-// CHECK-LABEL: @test_vqaddq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqaddq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQADDQ_V2_I]]
+//
 uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
   return vqaddq_u64(a, b);
 }
-// CHECK-LABEL: @test_vqsub_s8(
-// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQSUB_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqsub_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQSUB_V_I]]
+//
 int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
   return vqsub_s8(a, b);
 }
-// CHECK-LABEL: @test_vqsub_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQSUB_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqsub_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQSUB_V2_I]]
+//
 int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
   return vqsub_s16(a, b);
 }
-// CHECK-LABEL: @test_vqsub_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQSUB_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqsub_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQSUB_V2_I]]
+//
 int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
   return vqsub_s32(a, b);
 }
-// CHECK-LABEL: @test_vqsub_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQSUB_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqsub_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQSUB_V2_I]]
+//
 int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
   return vqsub_s64(a, b);
 }
-// CHECK-LABEL: @test_vqsub_u8(
-// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQSUB_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqsub_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQSUB_V_I]]
+//
 uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
   return vqsub_u8(a, b);
 }
-// CHECK-LABEL: @test_vqsub_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQSUB_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqsub_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQSUB_V2_I]]
+//
 uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
   return vqsub_u16(a, b);
 }
-// CHECK-LABEL: @test_vqsub_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQSUB_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqsub_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQSUB_V2_I]]
+//
 uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
   return vqsub_u32(a, b);
 }
-// CHECK-LABEL: @test_vqsub_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQSUB_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqsub_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQSUB_V2_I]]
+//
 uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
   return vqsub_u64(a, b);
 }
-// CHECK-LABEL: @test_vqsubq_s8(
-// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqsubq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQSUBQ_V_I]]
+//
 int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
   return vqsubq_s8(a, b);
 }
-// CHECK-LABEL: @test_vqsubq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqsubq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]]
+//
 int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
   return vqsubq_s16(a, b);
 }
-// CHECK-LABEL: @test_vqsubq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqsubq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]]
+//
 int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
   return vqsubq_s32(a, b);
 }
-// CHECK-LABEL: @test_vqsubq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqsubq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQSUBQ_V2_I]]
+//
 int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
   return vqsubq_s64(a, b);
 }
-// CHECK-LABEL: @test_vqsubq_u8(
-// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQSUBQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqsubq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQSUBQ_V_I]]
+//
 uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
   return vqsubq_u8(a, b);
 }
-// CHECK-LABEL: @test_vqsubq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqsubq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]]
+//
 uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
   return vqsubq_u16(a, b);
 }
-// CHECK-LABEL: @test_vqsubq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqsubq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]]
+//
 uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
   return vqsubq_u32(a, b);
 }
-// CHECK-LABEL: @test_vqsubq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqsubq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQSUBQ_V2_I]]
+//
 uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
   return vqsubq_u64(a, b);
 }
-// CHECK-LABEL: @test_vshl_s8(
-// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sshl.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VSHL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VSHL_V_I]]
+//
 int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
   return vshl_s8(a, b);
 }
-// CHECK-LABEL: @test_vshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VSHL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VSHL_V2_I]]
+//
 int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
   return vshl_s16(a, b);
 }
-// CHECK-LABEL: @test_vshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VSHL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VSHL_V2_I]]
+//
 int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
   return vshl_s32(a, b);
 }
-// CHECK-LABEL: @test_vshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VSHL_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VSHL_V2_I]]
+//
 int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
   return vshl_s64(a, b);
 }
-// CHECK-LABEL: @test_vshl_u8(
-// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.ushl.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VSHL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.ushl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VSHL_V_I]]
+//
 uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
   return vshl_u8(a, b);
 }
-// CHECK-LABEL: @test_vshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VSHL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VSHL_V2_I]]
+//
 uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
   return vshl_u16(a, b);
 }
-// CHECK-LABEL: @test_vshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VSHL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VSHL_V2_I]]
+//
 uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
   return vshl_u32(a, b);
 }
-// CHECK-LABEL: @test_vshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VSHL_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VSHL_V2_I]]
+//
 uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
   return vshl_u64(a, b);
 }
-// CHECK-LABEL: @test_vshlq_s8(
-// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vshlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VSHLQ_V_I]]
+//
 int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
   return vshlq_s8(a, b);
 }
-// CHECK-LABEL: @test_vshlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VSHLQ_V2_I]]
+//
 int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
   return vshlq_s16(a, b);
 }
-// CHECK-LABEL: @test_vshlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VSHLQ_V2_I]]
+//
 int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
   return vshlq_s32(a, b);
 }
-// CHECK-LABEL: @test_vshlq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshlq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VSHLQ_V2_I]]
+//
 int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
   return vshlq_s64(a, b);
 }
-// CHECK-LABEL: @test_vshlq_u8(
-// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VSHLQ_V_I]]
+//
 uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
   return vshlq_u8(a, b);
 }
-// CHECK-LABEL: @test_vshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VSHLQ_V2_I]]
+//
 uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
   return vshlq_u16(a, b);
 }
-// CHECK-LABEL: @test_vshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VSHLQ_V2_I]]
+//
 uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
   return vshlq_u32(a, b);
 }
-// CHECK-LABEL: @test_vshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VSHLQ_V2_I]]
+//
 uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
   return vshlq_u64(a, b);
 }
-// CHECK-LABEL: @test_vqshl_s8(
-// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQSHL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQSHL_V_I]]
+//
 int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
   return vqshl_s8(a, b);
 }
-// CHECK-LABEL: @test_vqshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQSHL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQSHL_V2_I]]
+//
 int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
   return vqshl_s16(a, b);
 }
-// CHECK-LABEL: @test_vqshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQSHL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQSHL_V2_I]]
+//
 int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
   return vqshl_s32(a, b);
 }
-// CHECK-LABEL: @test_vqshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQSHL_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_V2_I]]
+//
 int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
   return vqshl_s64(a, b);
 }
-// CHECK-LABEL: @test_vqshl_u8(
-// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQSHL_V_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQSHL_V_I]]
+//
 uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
   return vqshl_u8(a, b);
 }
-// CHECK-LABEL: @test_vqshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQSHL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQSHL_V2_I]]
+//
 uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
   return vqshl_u16(a, b);
 }
-// CHECK-LABEL: @test_vqshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQSHL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQSHL_V2_I]]
+//
 uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
   return vqshl_u32(a, b);
 }
-// CHECK-LABEL: @test_vqshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQSHL_V2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_V2_I]]
+//
 uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
   return vqshl_u64(a, b);
 }
-// CHECK-LABEL: @test_vqshlq_s8(
-// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshlq_s8(
+//
CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQSHLQ_V_I]] +// int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) { return vqshlq_s8(a, b); } -// CHECK-LABEL: @test_vqshlq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqshlq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VQSHLQ_V2_I]] +// int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) { return vqshlq_s16(a, b); } -// CHECK-LABEL: @test_vqshlq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqshlq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VQSHLQ_V2_I]] +// int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) { return vqshlq_s32(a, b); } -// CHECK-LABEL: @test_vqshlq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqshlq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i64> [[VQSHLQ_V2_I]] +// int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) { return vqshlq_s64(a, b); } -// CHECK-LABEL: @test_vqshlq_u8( -// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQSHLQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vqshlq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQSHLQ_V_I]] +// uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) { return vqshlq_u8(a, b); } -// CHECK-LABEL: @test_vqshlq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// 
CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqshlq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VQSHLQ_V2_I]] +// uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) { return vqshlq_u16(a, b); } -// CHECK-LABEL: @test_vqshlq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqshlq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VQSHLQ_V2_I]] +// uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) { return vqshlq_u32(a, b); } -// CHECK-LABEL: @test_vqshlq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqshlq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i64> [[VQSHLQ_V2_I]] +// uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) { return vqshlq_u64(a, b); } -// CHECK-LABEL: @test_vrshl_s8( -// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VRSHL_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrshl_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VRSHL_V_I]] +// int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) { return vrshl_s8(a, b); } -// CHECK-LABEL: @test_vrshl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VRSHL_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vrshl_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x 
i16> [[VRSHL_V2_I]] +// int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) { return vrshl_s16(a, b); } -// CHECK-LABEL: @test_vrshl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VRSHL_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vrshl_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VRSHL_V2_I]] +// int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) { return vrshl_s32(a, b); } -// CHECK-LABEL: @test_vrshl_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VRSHL_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vrshl_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[A]], <1 x i64> [[B]]) +// CHECK-NEXT: ret <1 x i64> [[VRSHL_V2_I]] +// int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) { return vrshl_s64(a, b); } -// CHECK-LABEL: @test_vrshl_u8( -// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VRSHL_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrshl_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VRSHL_V_I]] +// uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) { return vrshl_u8(a, b); } -// CHECK-LABEL: @test_vrshl_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VRSHL_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vrshl_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VRSHL_V2_I]] +// uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) { return vrshl_u16(a, b); } -// CHECK-LABEL: @test_vrshl_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VRSHL_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vrshl_u32( +// CHECK-SAME: <2 x i32> noundef 
[[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VRSHL_V2_I]] +// uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) { return vrshl_u32(a, b); } -// CHECK-LABEL: @test_vrshl_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VRSHL_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vrshl_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[A]], <1 x i64> [[B]]) +// CHECK-NEXT: ret <1 x i64> [[VRSHL_V2_I]] +// uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) { return vrshl_u64(a, b); } -// CHECK-LABEL: @test_vrshlq_s8( -// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VRSHLQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrshlq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VRSHLQ_V_I]] +// int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) { return vrshlq_s8(a, b); } -// CHECK-LABEL: @test_vrshlq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vrshlq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VRSHLQ_V2_I]] +// int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) { return vrshlq_s16(a, b); } -// CHECK-LABEL: @test_vrshlq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vrshlq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VRSHLQ_V2_I]] +// int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) { return vrshlq_s32(a, b); } -// CHECK-LABEL: @test_vrshlq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x 
i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vrshlq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i64> [[VRSHLQ_V2_I]] +// int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) { return vrshlq_s64(a, b); } -// CHECK-LABEL: @test_vrshlq_u8( -// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VRSHLQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrshlq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VRSHLQ_V_I]] +// uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) { return vrshlq_u8(a, b); } -// CHECK-LABEL: @test_vrshlq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vrshlq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VRSHLQ_V2_I]] +// uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) { return vrshlq_u16(a, b); } -// CHECK-LABEL: @test_vrshlq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vrshlq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VRSHLQ_V2_I]] +// uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) { return vrshlq_u32(a, b); } -// CHECK-LABEL: @test_vrshlq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vrshlq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i64> 
[[VRSHLQ_V2_I]] +// uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) { return vrshlq_u64(a, b); } -// CHECK-LABEL: @test_vqrshl_s8( -// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQRSHL_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshl_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQRSHL_V_I]] +// int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) { return vqrshl_s8(a, b); } -// CHECK-LABEL: @test_vqrshl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQRSHL_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshl_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VQRSHL_V2_I]] +// int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) { return vqrshl_s16(a, b); } -// CHECK-LABEL: @test_vqrshl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQRSHL_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshl_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VQRSHL_V2_I]] +// int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) { return vqrshl_s32(a, b); } -// CHECK-LABEL: @test_vqrshl_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQRSHL_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vqrshl_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[A]], <1 x i64> [[B]]) +// CHECK-NEXT: ret <1 x i64> [[VQRSHL_V2_I]] +// int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) { return vqrshl_s64(a, b); } -// CHECK-LABEL: @test_vqrshl_u8( -// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQRSHL_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshl_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQRSHL_V_I]] +// uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) { return vqrshl_u8(a, b); } -// CHECK-LABEL: @test_vqrshl_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQRSHL_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshl_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VQRSHL_V2_I]] +// uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) { return vqrshl_u16(a, b); } -// CHECK-LABEL: @test_vqrshl_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQRSHL_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshl_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VQRSHL_V2_I]] +// uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) { return vqrshl_u32(a, b); } -// CHECK-LABEL: @test_vqrshl_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQRSHL_V2_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vqrshl_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[A]], <1 x i64> [[B]]) +// CHECK-NEXT: ret <1 x i64> [[VQRSHL_V2_I]] +// uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) { return vqrshl_u64(a, b); } -// CHECK-LABEL: @test_vqrshlq_s8( -// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshlq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQRSHLQ_V_I]] +// int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) { return vqrshlq_s8(a, b); } -// CHECK-LABEL: @test_vqrshlq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> 
[[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshlq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VQRSHLQ_V2_I]] +// int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) { return vqrshlq_s16(a, b); } -// CHECK-LABEL: @test_vqrshlq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshlq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VQRSHLQ_V2_I]] +// int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) { return vqrshlq_s32(a, b); } -// CHECK-LABEL: @test_vqrshlq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqrshlq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i64> [[VQRSHLQ_V2_I]] +// int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) { return vqrshlq_s64(a, b); } -// CHECK-LABEL: @test_vqrshlq_u8( -// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshlq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQRSHLQ_V_I]] +// uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) { return vqrshlq_u8(a, b); } -// CHECK-LABEL: @test_vqrshlq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshlq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VQRSHLQ_V2_I]] +// uint16x8_t test_vqrshlq_u16(uint16x8_t a, 
int16x8_t b) { return vqrshlq_u16(a, b); } -// CHECK-LABEL: @test_vqrshlq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshlq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VQRSHLQ_V2_I]] +// uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) { return vqrshlq_u32(a, b); } -// CHECK-LABEL: @test_vqrshlq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqrshlq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i64> [[VQRSHLQ_V2_I]] +// uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) { return vqrshlq_u64(a, b); } -// CHECK-LABEL: @test_vsli_n_p64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 0) -// CHECK: ret <1 x i64> [[VSLI_N2]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vsli_n_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], i32 0) +// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]] +// poly64x1_t test_vsli_n_p64(poly64x1_t a, poly64x1_t b) { return vsli_n_p64(a, b, 0); } -// CHECK-LABEL: @test_vsliq_n_p64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 0) -// CHECK: ret <2 x i64> [[VSLI_N2]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vsliq_n_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], i32 0) +// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]] +// poly64x2_t test_vsliq_n_p64(poly64x2_t a, poly64x2_t b) { return vsliq_n_p64(a, b, 0); } -// CHECK-LABEL: @test_vmax_s8( -// CHECK: [[VMAX_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.smax.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VMAX_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmax_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VMAX_I]] +// int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) { return vmax_s8(a, b); } -// CHECK-LABEL: @test_vmax_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: ret <4 x i16> [[VMAX2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vmax_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VMAX2_I]] +// int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) { return vmax_s16(a, b); } -// CHECK-LABEL: @test_vmax_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: ret <2 x i32> [[VMAX2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vmax_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VMAX2_I]] +// int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) { return vmax_s32(a, b); } -// CHECK-LABEL: @test_vmax_u8( -// CHECK: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VMAX_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmax_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VMAX_I]] +// uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) { return vmax_u8(a, b); } -// CHECK-LABEL: @test_vmax_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: ret <4 x i16> [[VMAX2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vmax_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VMAX2_I]] +// uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) { return vmax_u16(a, b); } -// CHECK-LABEL: @test_vmax_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: ret <2 x i32> [[VMAX2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vmax_u32( +// CHECK-SAME: <2 x i32> noundef 
[[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VMAX2_I]] +// uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) { return vmax_u32(a, b); } -// CHECK-LABEL: @test_vmax_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: ret <2 x float> [[VMAX2_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vmax_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VMAX2_I]] +// float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) { return vmax_f32(a, b); } -// CHECK-LABEL: @test_vmaxq_s8( -// CHECK: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VMAX_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vmaxq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VMAX_I]] +// int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) { return vmaxq_s8(a, b); } -// CHECK-LABEL: @test_vmaxq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i16> [[VMAX2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmaxq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VMAX2_I]] +// int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) { return vmaxq_s16(a, b); } -// CHECK-LABEL: @test_vmaxq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: ret <4 x i32> [[VMAX2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmaxq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMAX2_I]] +// int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) { return vmaxq_s32(a, b); } -// CHECK-LABEL: @test_vmaxq_u8( -// CHECK: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VMAX_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vmaxq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> 
[[VMAX_I]] +// uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) { return vmaxq_u8(a, b); } -// CHECK-LABEL: @test_vmaxq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i16> [[VMAX2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmaxq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VMAX2_I]] +// uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) { return vmaxq_u16(a, b); } -// CHECK-LABEL: @test_vmaxq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: ret <4 x i32> [[VMAX2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmaxq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMAX2_I]] +// uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) { return vmaxq_u32(a, b); } -// CHECK-LABEL: @test_vmaxq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: ret <4 x float> [[VMAX2_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vmaxq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> [[A]], <4 x float> [[B]]) +// CHECK-NEXT: ret <4 x float> [[VMAX2_I]] +// float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) { return vmaxq_f32(a, b); } -// CHECK-LABEL: @test_vmaxq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[VMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> %a, <2 x double> %b) -// CHECK: ret <2 x double> [[VMAX2_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vmaxq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> [[A]], <2 x double> [[B]]) +// CHECK-NEXT: ret <2 x double> [[VMAX2_I]] +// float64x2_t test_vmaxq_f64(float64x2_t a, float64x2_t b) { return vmaxq_f64(a, b); } -// CHECK-LABEL: @test_vmin_s8( -// CHECK: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VMIN_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmin_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VMIN_I]] +// int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) { 
return vmin_s8(a, b); } -// CHECK-LABEL: @test_vmin_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: ret <4 x i16> [[VMIN2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vmin_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VMIN2_I]] +// int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) { return vmin_s16(a, b); } -// CHECK-LABEL: @test_vmin_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: ret <2 x i32> [[VMIN2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vmin_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VMIN2_I]] +// int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) { return vmin_s32(a, b); } -// CHECK-LABEL: @test_vmin_u8( -// CHECK: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VMIN_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmin_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VMIN_I]] +// uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) { return vmin_u8(a, b); } -// CHECK-LABEL: @test_vmin_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: ret <4 x i16> [[VMIN2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vmin_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VMIN2_I]] +// uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) { return vmin_u16(a, b); } -// CHECK-LABEL: @test_vmin_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: ret <2 x i32> [[VMIN2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vmin_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VMIN2_I]] +// uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) { return vmin_u32(a, b); } -// CHECK-LABEL: @test_vmin_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 
x i8> -// CHECK: [[VMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: ret <2 x float> [[VMIN2_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vmin_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VMIN2_I]] +// float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) { return vmin_f32(a, b); } -// CHECK-LABEL: @test_vminq_s8( -// CHECK: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VMIN_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vminq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VMIN_I]] +// int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) { return vminq_s8(a, b); } -// CHECK-LABEL: @test_vminq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i16> [[VMIN2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vminq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VMIN2_I]] +// int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) { return vminq_s16(a, b); } -// CHECK-LABEL: @test_vminq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: ret <4 x i32> [[VMIN2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vminq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMIN2_I]] +// int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) { return vminq_s32(a, b); } -// CHECK-LABEL: @test_vminq_u8( -// CHECK: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VMIN_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vminq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VMIN_I]] +// uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) { return vminq_u8(a, b); } -// CHECK-LABEL: @test_vminq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i16> [[VMIN2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vminq_u16( +// CHECK-SAME: <8 x i16> 
noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VMIN2_I]] +// uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) { return vminq_u16(a, b); } -// CHECK-LABEL: @test_vminq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: ret <4 x i32> [[VMIN2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vminq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMIN2_I]] +// uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) { return vminq_u32(a, b); } -// CHECK-LABEL: @test_vminq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: ret <4 x float> [[VMIN2_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vminq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> [[A]], <4 x float> [[B]]) +// CHECK-NEXT: ret <4 x float> [[VMIN2_I]] +// float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) { return vminq_f32(a, b); } -// CHECK-LABEL: @test_vminq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[VMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> %a, <2 x double> %b) -// CHECK: ret <2 x double> [[VMIN2_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vminq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> [[A]], <2 x double> [[B]]) +// CHECK-NEXT: ret <2 x double> [[VMIN2_I]] +// float64x2_t test_vminq_f64(float64x2_t a, float64x2_t b) { return vminq_f64(a, b); } -// CHECK-LABEL: @test_vmaxnm_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: ret <2 x float> [[VMAXNM2_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vmaxnm_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VMAXNM2_I]] +// float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) { return vmaxnm_f32(a, b); } -// CHECK-LABEL: @test_vmaxnmq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> %a, <4 x 
float> %b) -// CHECK: ret <4 x float> [[VMAXNM2_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vmaxnmq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> [[A]], <4 x float> [[B]]) +// CHECK-NEXT: ret <4 x float> [[VMAXNM2_I]] +// float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) { return vmaxnmq_f32(a, b); } -// CHECK-LABEL: @test_vmaxnmq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[VMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> %a, <2 x double> %b) -// CHECK: ret <2 x double> [[VMAXNM2_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vmaxnmq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> [[A]], <2 x double> [[B]]) +// CHECK-NEXT: ret <2 x double> [[VMAXNM2_I]] +// float64x2_t test_vmaxnmq_f64(float64x2_t a, float64x2_t b) { return vmaxnmq_f64(a, b); } -// CHECK-LABEL: @test_vminnm_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: ret <2 x float> [[VMINNM2_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vminnm_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VMINNM2_I]] +// float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) { return vminnm_f32(a, b); } -// CHECK-LABEL: @test_vminnmq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: ret <4 x float> [[VMINNM2_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vminnmq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> [[A]], <4 x float> [[B]]) +// CHECK-NEXT: ret <4 x float> [[VMINNM2_I]] +// float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) { return vminnmq_f32(a, b); } -// CHECK-LABEL: @test_vminnmq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[VMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> %a, <2 x double> %b) -// CHECK: ret <2 x double> [[VMINNM2_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vminnmq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> [[A]], <2 x double> [[B]]) +// CHECK-NEXT: ret <2 x double> [[VMINNM2_I]] +// float64x2_t test_vminnmq_f64(float64x2_t a, float64x2_t b) { return 
vminnmq_f64(a, b); } -// CHECK-LABEL: @test_vpmax_s8( -// CHECK: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VPMAX_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vpmax_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VPMAX_I]] +// int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) { return vpmax_s8(a, b); } -// CHECK-LABEL: @test_vpmax_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: ret <4 x i16> [[VPMAX2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vpmax_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VPMAX2_I]] +// int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) { return vpmax_s16(a, b); } -// CHECK-LABEL: @test_vpmax_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: ret <2 x i32> [[VPMAX2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vpmax_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VPMAX2_I]] +// int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) { return vpmax_s32(a, b); } -// CHECK-LABEL: @test_vpmax_u8( -// CHECK: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VPMAX_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vpmax_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VPMAX_I]] +// uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) { return vpmax_u8(a, b); } -// CHECK-LABEL: @test_vpmax_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: ret <4 x i16> [[VPMAX2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vpmax_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VPMAX2_I]] +// uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) { return vpmax_u16(a, b); } -// CHECK-LABEL: @test_vpmax_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> %a, <2 x 
i32> %b) -// CHECK: ret <2 x i32> [[VPMAX2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vpmax_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VPMAX2_I]] +// uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) { return vpmax_u32(a, b); } -// CHECK-LABEL: @test_vpmax_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VPMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: ret <2 x float> [[VPMAX2_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vpmax_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VPMAX2_I]] +// float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) { return vpmax_f32(a, b); } -// CHECK-LABEL: @test_vpmaxq_s8( -// CHECK: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VPMAX_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vpmaxq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VPMAX_I]] +// int8x16_t test_vpmaxq_s8(int8x16_t a, int8x16_t b) { return vpmaxq_s8(a, b); } -// CHECK-LABEL: @test_vpmaxq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i16> [[VPMAX2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vpmaxq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VPMAX2_I]] +// int16x8_t test_vpmaxq_s16(int16x8_t a, int16x8_t b) { return vpmaxq_s16(a, b); } -// CHECK-LABEL: @test_vpmaxq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: ret <4 x i32> [[VPMAX2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vpmaxq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VPMAX2_I]] +// int32x4_t test_vpmaxq_s32(int32x4_t a, int32x4_t b) { return vpmaxq_s32(a, b); } -// CHECK-LABEL: @test_vpmaxq_u8( -// CHECK: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VPMAX_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vpmaxq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VPMAX_I]] +// uint8x16_t test_vpmaxq_u8(uint8x16_t a, uint8x16_t b) { return vpmaxq_u8(a, b); } -// CHECK-LABEL: @test_vpmaxq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i16> [[VPMAX2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vpmaxq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VPMAX2_I]] +// uint16x8_t test_vpmaxq_u16(uint16x8_t a, uint16x8_t b) { return vpmaxq_u16(a, b); } -// CHECK-LABEL: @test_vpmaxq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: ret <4 x i32> [[VPMAX2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vpmaxq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VPMAX2_I]] +// uint32x4_t test_vpmaxq_u32(uint32x4_t a, uint32x4_t b) { return vpmaxq_u32(a, b); } -// CHECK-LABEL: @test_vpmaxq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VPMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: ret <4 x float> [[VPMAX2_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vpmaxq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> [[A]], <4 x float> [[B]]) +// CHECK-NEXT: ret <4 x float> [[VPMAX2_I]] +// float32x4_t test_vpmaxq_f32(float32x4_t a, float32x4_t b) { return vpmaxq_f32(a, b); } -// CHECK-LABEL: @test_vpmaxq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[VPMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> %a, <2 x double> %b) -// CHECK: ret <2 x double> [[VPMAX2_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vpmaxq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> [[A]], <2 x double> [[B]]) +// CHECK-NEXT: ret <2 x double> [[VPMAX2_I]] +// float64x2_t test_vpmaxq_f64(float64x2_t a, float64x2_t b) { return vpmaxq_f64(a, b); } -// CHECK-LABEL: @test_vpmin_s8( -// CHECK: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VPMIN_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vpmin_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] 
{ +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VPMIN_I]] +// int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) { return vpmin_s8(a, b); } -// CHECK-LABEL: @test_vpmin_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: ret <4 x i16> [[VPMIN2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vpmin_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VPMIN2_I]] +// int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) { return vpmin_s16(a, b); } -// CHECK-LABEL: @test_vpmin_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: ret <2 x i32> [[VPMIN2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vpmin_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VPMIN2_I]] +// int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) { return vpmin_s32(a, b); } -// CHECK-LABEL: @test_vpmin_u8( -// CHECK: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VPMIN_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vpmin_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VPMIN_I]] +// uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) { return vpmin_u8(a, b); } -// CHECK-LABEL: @test_vpmin_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: ret <4 x i16> [[VPMIN2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vpmin_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VPMIN2_I]] +// uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) { return vpmin_u16(a, b); } -// CHECK-LABEL: @test_vpmin_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: ret <2 x i32> [[VPMIN2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vpmin_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> [[A]], <2 x 
i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VPMIN2_I]] +// uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) { return vpmin_u32(a, b); } -// CHECK-LABEL: @test_vpmin_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VPMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: ret <2 x float> [[VPMIN2_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vpmin_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VPMIN2_I]] +// float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) { return vpmin_f32(a, b); } -// CHECK-LABEL: @test_vpminq_s8( -// CHECK: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VPMIN_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vpminq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VPMIN_I]] +// int8x16_t test_vpminq_s8(int8x16_t a, int8x16_t b) { return vpminq_s8(a, b); } -// CHECK-LABEL: @test_vpminq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i16> [[VPMIN2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vpminq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VPMIN2_I]] +// int16x8_t test_vpminq_s16(int16x8_t a, int16x8_t b) { return vpminq_s16(a, b); } -// CHECK-LABEL: @test_vpminq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: ret <4 x i32> [[VPMIN2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vpminq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VPMIN2_I]] +// int32x4_t test_vpminq_s32(int32x4_t a, int32x4_t b) { return vpminq_s32(a, b); } -// CHECK-LABEL: @test_vpminq_u8( -// CHECK: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VPMIN_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vpminq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VPMIN_I]] +// uint8x16_t test_vpminq_u8(uint8x16_t a, uint8x16_t b) { return vpminq_u8(a, b); } -// CHECK-LABEL: @test_vpminq_u16( -// CHECK: 
[[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i16> [[VPMIN2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vpminq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VPMIN2_I]] +// uint16x8_t test_vpminq_u16(uint16x8_t a, uint16x8_t b) { return vpminq_u16(a, b); } -// CHECK-LABEL: @test_vpminq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: ret <4 x i32> [[VPMIN2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vpminq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VPMIN2_I]] +// uint32x4_t test_vpminq_u32(uint32x4_t a, uint32x4_t b) { return vpminq_u32(a, b); } -// CHECK-LABEL: @test_vpminq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VPMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: ret <4 x float> [[VPMIN2_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vpminq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> [[A]], <4 x float> [[B]]) +// CHECK-NEXT: ret <4 x float> [[VPMIN2_I]] +// float32x4_t test_vpminq_f32(float32x4_t a, float32x4_t b) { return vpminq_f32(a, b); } -// CHECK-LABEL: @test_vpminq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[VPMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> %a, <2 x double> %b) -// CHECK: ret <2 x double> [[VPMIN2_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vpminq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> [[A]], <2 x double> [[B]]) +// CHECK-NEXT: ret <2 x double> [[VPMIN2_I]] +// float64x2_t test_vpminq_f64(float64x2_t a, float64x2_t b) { return vpminq_f64(a, b); } -// CHECK-LABEL: @test_vpmaxnm_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VPMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: ret <2 x float> [[VPMAXNM2_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vpmaxnm_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: 
ret <2 x float> [[VPMAXNM2_I]] +// float32x2_t test_vpmaxnm_f32(float32x2_t a, float32x2_t b) { return vpmaxnm_f32(a, b); } -// CHECK-LABEL: @test_vpmaxnmq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VPMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: ret <4 x float> [[VPMAXNM2_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vpmaxnmq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> [[A]], <4 x float> [[B]]) +// CHECK-NEXT: ret <4 x float> [[VPMAXNM2_I]] +// float32x4_t test_vpmaxnmq_f32(float32x4_t a, float32x4_t b) { return vpmaxnmq_f32(a, b); } -// CHECK-LABEL: @test_vpmaxnmq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[VPMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> %a, <2 x double> %b) -// CHECK: ret <2 x double> [[VPMAXNM2_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vpmaxnmq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> [[A]], <2 x double> [[B]]) +// CHECK-NEXT: ret <2 x double> [[VPMAXNM2_I]] +// float64x2_t test_vpmaxnmq_f64(float64x2_t a, float64x2_t b) { return vpmaxnmq_f64(a, b); } -// CHECK-LABEL: @test_vpminnm_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VPMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: ret <2 x float> [[VPMINNM2_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vpminnm_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VPMINNM2_I]] +// float32x2_t test_vpminnm_f32(float32x2_t a, float32x2_t b) { return vpminnm_f32(a, b); } -// CHECK-LABEL: @test_vpminnmq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VPMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: ret <4 x float> [[VPMINNM2_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vpminnmq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> [[A]], <4 x float> [[B]]) +// CHECK-NEXT: ret <4 x float> [[VPMINNM2_I]] +// float32x4_t test_vpminnmq_f32(float32x4_t a, float32x4_t b) { return vpminnmq_f32(a, b); } -// CHECK-LABEL: @test_vpminnmq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[VPMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> %a, <2 x double> %b) -// CHECK: ret <2 x double> [[VPMINNM2_I]] +// CHECK-LABEL: 
define dso_local <2 x double> @test_vpminnmq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> [[A]], <2 x double> [[B]]) +// CHECK-NEXT: ret <2 x double> [[VPMINNM2_I]] +// float64x2_t test_vpminnmq_f64(float64x2_t a, float64x2_t b) { return vpminnmq_f64(a, b); } -// CHECK-LABEL: @test_vpadd_s8( -// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VPADD_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vpadd_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VPADD_V_I]] +// int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) { return vpadd_s8(a, b); } -// CHECK-LABEL: @test_vpadd_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VPADD_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vpadd_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VPADD_V2_I]] +// int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) { return vpadd_s16(a, b); } -// CHECK-LABEL: @test_vpadd_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VPADD_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vpadd_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VPADD_V2_I]] +// int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) { return vpadd_s32(a, b); } -// CHECK-LABEL: @test_vpadd_u8( -// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VPADD_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vpadd_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VPADD_V_I]] +// uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) { return vpadd_u8(a, b); } -// CHECK-LABEL: @test_vpadd_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> 
[[VPADD_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vpadd_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VPADD_V2_I]] +// uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) { return vpadd_u16(a, b); } -// CHECK-LABEL: @test_vpadd_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VPADD_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vpadd_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VPADD_V2_I]] +// uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) { return vpadd_u32(a, b); } -// CHECK-LABEL: @test_vpadd_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x float> [[VPADD_V2_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vpadd_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VPADD_V2_I]] +// float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) { return vpadd_f32(a, b); } -// CHECK-LABEL: @test_vpaddq_s8( -// CHECK: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VPADDQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vpaddq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VPADDQ_V_I]] +// int8x16_t test_vpaddq_s8(int8x16_t a, int8x16_t b) { return vpaddq_s8(a, b); } -// CHECK-LABEL: @test_vpaddq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VPADDQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vpaddq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VPADDQ_V2_I]] +// int16x8_t test_vpaddq_s16(int16x8_t a, int16x8_t b) { return vpaddq_s16(a, b); } -// CHECK-LABEL: @test_vpaddq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x 
i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VPADDQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vpaddq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VPADDQ_V2_I]] +// int32x4_t test_vpaddq_s32(int32x4_t a, int32x4_t b) { return vpaddq_s32(a, b); } -// CHECK-LABEL: @test_vpaddq_u8( -// CHECK: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VPADDQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vpaddq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VPADDQ_V_I]] +// uint8x16_t test_vpaddq_u8(uint8x16_t a, uint8x16_t b) { return vpaddq_u8(a, b); } -// CHECK-LABEL: @test_vpaddq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VPADDQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vpaddq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VPADDQ_V2_I]] +// uint16x8_t test_vpaddq_u16(uint16x8_t a, uint16x8_t b) { return vpaddq_u16(a, b); } -// CHECK-LABEL: @test_vpaddq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VPADDQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vpaddq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VPADDQ_V2_I]] +// uint32x4_t test_vpaddq_u32(uint32x4_t a, uint32x4_t b) { return vpaddq_u32(a, b); } -// CHECK-LABEL: @test_vpaddq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <4 x float> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VPADDQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vpaddq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> [[A]], <4 x float> [[B]]) +// CHECK-NEXT: ret <4 x float> [[VPADDQ_V2_I]] +// float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) { return vpaddq_f32(a, b); } -// CHECK-LABEL: @test_vpaddq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %a, <2 x double> %b) -// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x double> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x double> [[VPADDQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vpaddq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> [[A]], <2 x double> [[B]]) +// CHECK-NEXT: ret <2 x double> [[VPADDQ_V2_I]] +// float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) { return vpaddq_f64(a, b); } -// CHECK-LABEL: @test_vqdmulh_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQDMULH_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqdmulh_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]] +// int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) { return vqdmulh_s16(a, b); } -// CHECK-LABEL: @test_vqdmulh_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQDMULH_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqdmulh_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]] +// int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) { return vqdmulh_s32(a, b); } -// CHECK-LABEL: @test_vqdmulhq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqdmulhq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]] +// int16x8_t test_vqdmulhq_s16(int16x8_t a, 
int16x8_t b) { return vqdmulhq_s16(a, b); } -// CHECK-LABEL: @test_vqdmulhq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmulhq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]] +// int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) { return vqdmulhq_s32(a, b); } -// CHECK-LABEL: @test_vqrdmulh_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqrdmulh_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]] +// int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) { return vqrdmulh_s16(a, b); } -// CHECK-LABEL: @test_vqrdmulh_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqrdmulh_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]] +// int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) { return vqrdmulh_s32(a, b); } -// CHECK-LABEL: @test_vqrdmulhq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqrdmulhq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]] +// int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) { return vqrdmulhq_s16(a, b); } -// CHECK-LABEL: @test_vqrdmulhq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> 
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqrdmulhq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]] +// int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) { return vqrdmulhq_s32(a, b); } -// CHECK-LABEL: @test_vmulx_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: ret <2 x float> [[VMULX2_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vmulx_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VMULX2_I]] +// float32x2_t test_vmulx_f32(float32x2_t a, float32x2_t b) { return vmulx_f32(a, b); } -// CHECK-LABEL: @test_vmulxq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: ret <4 x float> [[VMULX2_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vmulxq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[B]]) +// CHECK-NEXT: ret <4 x float> [[VMULX2_I]] +// float32x4_t test_vmulxq_f32(float32x4_t a, float32x4_t b) { return vmulxq_f32(a, b); } -// CHECK-LABEL: @test_vmulxq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %b) -// CHECK: ret <2 x double> [[VMULX2_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vmulxq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[B]]) +// CHECK-NEXT: ret <2 x double> [[VMULX2_I]] +// float64x2_t test_vmulxq_f64(float64x2_t a, float64x2_t b) { return vmulxq_f64(a, b); } -// CHECK-LABEL: @test_vshl_n_s8( -// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, splat (i8 3) -// CHECK: ret <8 x i8> [[VSHL_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vshl_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i8> [[A]], splat (i8 3) +// CHECK-NEXT: ret <8 x i8> [[VSHL_N]] +// int8x8_t test_vshl_n_s8(int8x8_t a) { return vshl_n_s8(a, 3); } -// CHECK-LABEL: @test_vshl_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x 
i16> -// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 3) -// CHECK: ret <4 x i16> [[VSHL_N]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vshl_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i16> [[A]], splat (i16 3) +// CHECK-NEXT: ret <4 x i16> [[VSHL_N]] +// int16x4_t test_vshl_n_s16(int16x4_t a) { return vshl_n_s16(a, 3); } -// CHECK-LABEL: @test_vshl_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 3) -// CHECK: ret <2 x i32> [[VSHL_N]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vshl_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i32> [[A]], splat (i32 3) +// CHECK-NEXT: ret <2 x i32> [[VSHL_N]] +// int32x2_t test_vshl_n_s32(int32x2_t a) { return vshl_n_s32(a, 3); } -// CHECK-LABEL: @test_vshlq_n_s8( -// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, splat (i8 3) -// CHECK: ret <16 x i8> [[VSHL_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vshlq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <16 x i8> [[A]], splat (i8 3) +// CHECK-NEXT: ret <16 x i8> [[VSHL_N]] +// int8x16_t test_vshlq_n_s8(int8x16_t a) { return vshlq_n_s8(a, 3); } -// CHECK-LABEL: @test_vshlq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 3) -// CHECK: ret <8 x i16> [[VSHL_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshlq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i16> [[A]], splat (i16 3) +// CHECK-NEXT: ret <8 x i16> [[VSHL_N]] +// int16x8_t test_vshlq_n_s16(int16x8_t a) { return vshlq_n_s16(a, 3); } -// CHECK-LABEL: @test_vshlq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 3) -// CHECK: ret <4 x i32> [[VSHL_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshlq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i32> [[A]], splat (i32 3) +// CHECK-NEXT: ret <4 x i32> [[VSHL_N]] +// int32x4_t test_vshlq_n_s32(int32x4_t a) { return vshlq_n_s32(a, 3); } -// CHECK-LABEL: @test_vshlq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 3) -// CHECK: ret <2 x i64> [[VSHL_N]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vshlq_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i64> [[A]], splat (i64 3) +// CHECK-NEXT: ret <2 x i64> [[VSHL_N]] +// int64x2_t test_vshlq_n_s64(int64x2_t a) { return vshlq_n_s64(a, 3); } -// CHECK-LABEL: @test_vshl_n_u8( -// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, splat (i8 3) -// CHECK: ret <8 x i8> [[VSHL_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vshl_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i8> [[A]], splat (i8 3) +// CHECK-NEXT: ret <8 x i8> [[VSHL_N]] +// uint8x8_t test_vshl_n_u8(uint8x8_t a) { return vshl_n_u8(a, 3); } -// CHECK-LABEL: @test_vshl_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 3) -// CHECK: ret <4 x i16> [[VSHL_N]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vshl_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i16> [[A]], splat (i16 3) +// CHECK-NEXT: ret <4 x i16> [[VSHL_N]] +// uint16x4_t test_vshl_n_u16(uint16x4_t a) { return vshl_n_u16(a, 3); } -// CHECK-LABEL: @test_vshl_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 3) -// CHECK: ret <2 x i32> [[VSHL_N]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vshl_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i32> [[A]], splat (i32 3) +// CHECK-NEXT: ret <2 x i32> [[VSHL_N]] +// uint32x2_t test_vshl_n_u32(uint32x2_t a) { return vshl_n_u32(a, 3); } -// CHECK-LABEL: @test_vshlq_n_u8( -// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, splat (i8 3) -// CHECK: ret <16 x i8> [[VSHL_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vshlq_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <16 x i8> [[A]], splat (i8 3) +// CHECK-NEXT: ret <16 x i8> [[VSHL_N]] +// uint8x16_t test_vshlq_n_u8(uint8x16_t a) { return vshlq_n_u8(a, 3); } -// CHECK-LABEL: @test_vshlq_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 3) -// CHECK: ret <8 x i16> [[VSHL_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshlq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i16> [[A]], splat (i16 3) +// CHECK-NEXT: ret <8 x i16> [[VSHL_N]] +// uint16x8_t test_vshlq_n_u16(uint16x8_t a) { return vshlq_n_u16(a, 3); } -// CHECK-LABEL: @test_vshlq_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 3) -// CHECK: ret <4 x i32> [[VSHL_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshlq_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i32> [[A]], splat (i32 3) +// CHECK-NEXT: ret <4 x i32> [[VSHL_N]] +// uint32x4_t test_vshlq_n_u32(uint32x4_t a) { return vshlq_n_u32(a, 3); } -// CHECK-LABEL: @test_vshlq_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 3) -// CHECK: ret <2 x i64> [[VSHL_N]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vshlq_n_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i64> [[A]], splat (i64 3) +// CHECK-NEXT: ret <2 x i64> 
[[VSHL_N]] +// uint64x2_t test_vshlq_n_u64(uint64x2_t a) { return vshlq_n_u64(a, 3); } -// CHECK-LABEL: @test_vshr_n_s8( -// CHECK: [[VSHR_N:%.*]] = ashr <8 x i8> %a, splat (i8 3) -// CHECK: ret <8 x i8> [[VSHR_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vshr_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <8 x i8> [[A]], splat (i8 3) +// CHECK-NEXT: ret <8 x i8> [[VSHR_N]] +// int8x8_t test_vshr_n_s8(int8x8_t a) { return vshr_n_s8(a, 3); } -// CHECK-LABEL: @test_vshr_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], splat (i16 3) -// CHECK: ret <4 x i16> [[VSHR_N]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vshr_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <4 x i16> [[A]], splat (i16 3) +// CHECK-NEXT: ret <4 x i16> [[VSHR_N]] +// int16x4_t test_vshr_n_s16(int16x4_t a) { return vshr_n_s16(a, 3); } -// CHECK-LABEL: @test_vshr_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], splat (i32 3) -// CHECK: ret <2 x i32> [[VSHR_N]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vshr_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <2 x i32> [[A]], splat (i32 3) +// CHECK-NEXT: ret <2 x i32> [[VSHR_N]] +// int32x2_t test_vshr_n_s32(int32x2_t a) { return vshr_n_s32(a, 3); } -// CHECK-LABEL: @test_vshrq_n_s8( -// CHECK: [[VSHR_N:%.*]] = ashr <16 x i8> %a, splat (i8 3) -// CHECK: ret <16 x i8> [[VSHR_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vshrq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <16 x i8> [[A]], splat (i8 3) +// CHECK-NEXT: ret <16 x i8> [[VSHR_N]] +// int8x16_t test_vshrq_n_s8(int8x16_t a) { return vshrq_n_s8(a, 3); } -// CHECK-LABEL: @test_vshrq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 3) -// CHECK: ret <8 x i16> [[VSHR_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshrq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <8 x i16> [[A]], splat (i16 3) +// CHECK-NEXT: ret <8 x i16> [[VSHR_N]] +// int16x8_t test_vshrq_n_s16(int16x8_t a) { return vshrq_n_s16(a, 3); } -// CHECK-LABEL: @test_vshrq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 3) -// CHECK: ret <4 x i32> [[VSHR_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshrq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <4 x i32> [[A]], splat (i32 3) +// CHECK-NEXT: ret <4 x i32> [[VSHR_N]] +// int32x4_t test_vshrq_n_s32(int32x4_t a) { return vshrq_n_s32(a, 3); } -// CHECK-LABEL: @test_vshrq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// 
CHECK: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 3) -// CHECK: ret <2 x i64> [[VSHR_N]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vshrq_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <2 x i64> [[A]], splat (i64 3) +// CHECK-NEXT: ret <2 x i64> [[VSHR_N]] +// int64x2_t test_vshrq_n_s64(int64x2_t a) { return vshrq_n_s64(a, 3); } -// CHECK-LABEL: @test_vshr_n_u8( -// CHECK: [[VSHR_N:%.*]] = lshr <8 x i8> %a, splat (i8 3) -// CHECK: ret <8 x i8> [[VSHR_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vshr_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <8 x i8> [[A]], splat (i8 3) +// CHECK-NEXT: ret <8 x i8> [[VSHR_N]] +// uint8x8_t test_vshr_n_u8(uint8x8_t a) { return vshr_n_u8(a, 3); } -// CHECK-LABEL: @test_vshr_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], splat (i16 3) -// CHECK: ret <4 x i16> [[VSHR_N]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vshr_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <4 x i16> [[A]], splat (i16 3) +// CHECK-NEXT: ret <4 x i16> [[VSHR_N]] +// uint16x4_t test_vshr_n_u16(uint16x4_t a) { return vshr_n_u16(a, 3); } -// CHECK-LABEL: @test_vshr_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], splat (i32 3) -// CHECK: ret <2 x i32> [[VSHR_N]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vshr_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <2 x i32> [[A]], splat (i32 3) +// CHECK-NEXT: ret <2 x i32> [[VSHR_N]] +// uint32x2_t test_vshr_n_u32(uint32x2_t a) { return vshr_n_u32(a, 3); } -// CHECK-LABEL: @test_vshrq_n_u8( -// CHECK: [[VSHR_N:%.*]] = lshr <16 x i8> %a, splat (i8 3) -// CHECK: ret <16 x i8> [[VSHR_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vshrq_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <16 x i8> [[A]], splat (i8 3) +// CHECK-NEXT: ret <16 x i8> [[VSHR_N]] +// uint8x16_t test_vshrq_n_u8(uint8x16_t a) { return vshrq_n_u8(a, 3); } -// CHECK-LABEL: @test_vshrq_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 3) -// CHECK: ret <8 x i16> [[VSHR_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshrq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <8 x i16> [[A]], splat (i16 3) +// CHECK-NEXT: ret <8 x i16> [[VSHR_N]] +// uint16x8_t test_vshrq_n_u16(uint16x8_t a) { return vshrq_n_u16(a, 3); } -// CHECK-LABEL: @test_vshrq_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 3) -// CHECK: ret <4 x i32> [[VSHR_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshrq_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
-// CHECK-LABEL: @test_vsra_n_s8(
-// CHECK: [[VSRA_N:%.*]] = ashr <8 x i8> %b, splat (i8 3)
-// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsra_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <8 x i8> [[B]], splat (i8 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) { return vsra_n_s8(a, b, 3); }
-// CHECK-LABEL: @test_vsra_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], splat (i16 3)
-// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <4 x i16> [[TMP4]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsra_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <4 x i16> [[B]], splat (i16 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) { return vsra_n_s16(a, b, 3); }
-// CHECK-LABEL: @test_vsra_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], splat (i32 3)
-// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <2 x i32> [[TMP4]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsra_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <2 x i32> [[B]], splat (i32 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) { return vsra_n_s32(a, b, 3); }
-// CHECK-LABEL: @test_vsraq_n_s8(
-// CHECK: [[VSRA_N:%.*]] = ashr <16 x i8> %b, splat (i8 3)
-// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsraq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <16 x i8> [[B]], splat (i8 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) { return vsraq_n_s8(a, b, 3); }
-// CHECK-LABEL: @test_vsraq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], splat (i16 3)
-// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <8 x i16> [[TMP4]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsraq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <8 x i16> [[B]], splat (i16 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) { return vsraq_n_s16(a, b, 3); }
-// CHECK-LABEL: @test_vsraq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], splat (i32 3)
-// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <4 x i32> [[TMP4]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsraq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <4 x i32> [[B]], splat (i32 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) { return vsraq_n_s32(a, b, 3); }
-// CHECK-LABEL: @test_vsraq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], splat (i64 3)
-// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <2 x i64> [[TMP4]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsraq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <2 x i64> [[B]], splat (i64 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) { return vsraq_n_s64(a, b, 3); }
-// CHECK-LABEL: @test_vsra_n_u8(
-// CHECK: [[VSRA_N:%.*]] = lshr <8 x i8> %b, splat (i8 3)
-// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsra_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <8 x i8> [[B]], splat (i8 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) { return vsra_n_u8(a, b, 3); }
-// CHECK-LABEL: @test_vsra_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], splat (i16 3)
-// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <4 x i16> [[TMP4]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsra_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <4 x i16> [[B]], splat (i16 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) { return vsra_n_u16(a, b, 3); }
-// CHECK-LABEL: @test_vsra_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], splat (i32 3)
-// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <2 x i32> [[TMP4]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsra_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <2 x i32> [[B]], splat (i32 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) { return vsra_n_u32(a, b, 3); }
-// CHECK-LABEL: @test_vsraq_n_u8(
-// CHECK: [[VSRA_N:%.*]] = lshr <16 x i8> %b, splat (i8 3)
-// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsraq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <16 x i8> [[B]], splat (i8 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) { return vsraq_n_u8(a, b, 3); }
-// CHECK-LABEL: @test_vsraq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], splat (i16 3)
-// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <8 x i16> [[TMP4]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsraq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <8 x i16> [[B]], splat (i16 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) { return vsraq_n_u16(a, b, 3); }
-// CHECK-LABEL: @test_vsraq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], splat (i32 3)
-// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <4 x i32> [[TMP4]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsraq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <4 x i32> [[B]], splat (i32 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) { return vsraq_n_u32(a, b, 3); }
-// CHECK-LABEL: @test_vsraq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 3)
-// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <2 x i64> [[TMP4]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsraq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <2 x i64> [[B]], splat (i64 3)
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) { return vsraq_n_u64(a, b, 3); }
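For reference: vsra_n ("shift right and accumulate") adds the shifted second operand to the first, i.e. a + (b >> n) per lane, which is why the new checks show a bare ashr/lshr followed by an add. A minimal scalar model of one signed lane, assuming an 8-bit lane and shift amount 3 (illustrative helper, not part of the test file):

  int8_t vsra_n_s8_lane(int8_t a, int8_t b) { return (int8_t)(a + (b >> 3)); }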
-// CHECK-LABEL: @test_vrshr_n_s8(
-// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> splat (i8 -3))
-// CHECK: ret <8 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrshr_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 -3))
+// CHECK-NEXT: ret <8 x i8> [[VRSHR_N]]
+//
 int8x8_t test_vrshr_n_s8(int8x8_t a) { return vrshr_n_s8(a, 3); }
-// CHECK-LABEL: @test_vrshr_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -3))
-// CHECK: ret <4 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrshr_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[A]], <4 x i16> splat (i16 -3))
+// CHECK-NEXT: ret <4 x i16> [[VRSHR_N1]]
+//
 int16x4_t test_vrshr_n_s16(int16x4_t a) { return vrshr_n_s16(a, 3); }
-// CHECK-LABEL: @test_vrshr_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -3))
-// CHECK: ret <2 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrshr_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[A]], <2 x i32> splat (i32 -3))
+// CHECK-NEXT: ret <2 x i32> [[VRSHR_N1]]
+//
 int32x2_t test_vrshr_n_s32(int32x2_t a) { return vrshr_n_s32(a, 3); }
-// CHECK-LABEL: @test_vrshrq_n_s8(
-// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> splat (i8 -3))
-// CHECK: ret <16 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrshrq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 -3))
+// CHECK-NEXT: ret <16 x i8> [[VRSHR_N]]
+//
 int8x16_t test_vrshrq_n_s8(int8x16_t a) { return vrshrq_n_s8(a, 3); }
-// CHECK-LABEL: @test_vrshrq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -3))
-// CHECK: ret <8 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrshrq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[A]], <8 x i16> splat (i16 -3))
+// CHECK-NEXT: ret <8 x i16> [[VRSHR_N1]]
+//
 int16x8_t test_vrshrq_n_s16(int16x8_t a) { return vrshrq_n_s16(a, 3); }
-// CHECK-LABEL: @test_vrshrq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -3))
-// CHECK: ret <4 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrshrq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[A]], <4 x i32> splat (i32 -3))
+// CHECK-NEXT: ret <4 x i32> [[VRSHR_N1]]
+//
 int32x4_t test_vrshrq_n_s32(int32x4_t a) { return vrshrq_n_s32(a, 3); }
-// CHECK-LABEL: @test_vrshrq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -3))
-// CHECK: ret <2 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vrshrq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[A]], <2 x i64> splat (i64 -3))
+// CHECK-NEXT: ret <2 x i64> [[VRSHR_N1]]
+//
 int64x2_t test_vrshrq_n_s64(int64x2_t a) { return vrshrq_n_s64(a, 3); }
-// CHECK-LABEL: @test_vrshr_n_u8(
-// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> splat (i8 -3))
-// CHECK: ret <8 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrshr_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 -3))
+// CHECK-NEXT: ret <8 x i8> [[VRSHR_N]]
+//
 uint8x8_t test_vrshr_n_u8(uint8x8_t a) { return vrshr_n_u8(a, 3); }
-// CHECK-LABEL: @test_vrshr_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -3))
-// CHECK: ret <4 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrshr_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[A]], <4 x i16> splat (i16 -3))
+// CHECK-NEXT: ret <4 x i16> [[VRSHR_N1]]
+//
 uint16x4_t test_vrshr_n_u16(uint16x4_t a) { return vrshr_n_u16(a, 3); }
-// CHECK-LABEL: @test_vrshr_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -3))
-// CHECK: ret <2 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrshr_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[A]], <2 x i32> splat (i32 -3))
+// CHECK-NEXT: ret <2 x i32> [[VRSHR_N1]]
+//
 uint32x2_t test_vrshr_n_u32(uint32x2_t a) { return vrshr_n_u32(a, 3); }
-// CHECK-LABEL: @test_vrshrq_n_u8(
-// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> splat (i8 -3))
-// CHECK: ret <16 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrshrq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 -3))
+// CHECK-NEXT: ret <16 x i8> [[VRSHR_N]]
+//
 uint8x16_t test_vrshrq_n_u8(uint8x16_t a) { return vrshrq_n_u8(a, 3); }
-// CHECK-LABEL: @test_vrshrq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -3))
-// CHECK: ret <8 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrshrq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[A]], <8 x i16> splat (i16 -3))
+// CHECK-NEXT: ret <8 x i16> [[VRSHR_N1]]
+//
 uint16x8_t test_vrshrq_n_u16(uint16x8_t a) { return vrshrq_n_u16(a, 3); }
-// CHECK-LABEL: @test_vrshrq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -3))
-// CHECK: ret <4 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrshrq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[A]], <4 x i32> splat (i32 -3))
+// CHECK-NEXT: ret <4 x i32> [[VRSHR_N1]]
+//
 uint32x4_t test_vrshrq_n_u32(uint32x4_t a) { return vrshrq_n_u32(a, 3); }
-// CHECK-LABEL: @test_vrshrq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -3))
-// CHECK: ret <2 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vrshrq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[A]], <2 x i64> splat (i64 -3))
+// CHECK-NEXT: ret <2 x i64> [[VRSHR_N1]]
+//
 uint64x2_t test_vrshrq_n_u64(uint64x2_t a) { return vrshrq_n_u64(a, 3); }
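For reference: vrshr_n is a rounding shift right, which LLVM models as srshl/urshl (rounding shift left) by a negative amount; that is why the splat operands above are -3 rather than 3, and why instcombine cannot reduce the call itself, only the surrounding bitcasts. A minimal scalar model of one signed lane, assuming a 16-bit lane, shift amount 3, and widening to avoid overflow (illustrative helper, not part of the test file):

  int16_t vrshr_n_s16_lane(int16_t x) {
      return (int16_t)(((int32_t)x + (1 << 2)) >> 3); /* add 1 << (n-1), then shift */
  }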
-// CHECK-LABEL: @test_vrsra_n_s8(
-// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %b, <8 x i8> splat (i8 -3))
-// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]]
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrsra_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[B]], <8 x i8> splat (i8 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VRSHR_N]]
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) { return vrsra_n_s8(a, b, 3); }
-// CHECK-LABEL: @test_vrsra_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <4 x i16> [[TMP3]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrsra_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[B]], <4 x i16> splat (i16 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) { return vrsra_n_s16(a, b, 3); }
-// CHECK-LABEL: @test_vrsra_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <2 x i32> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrsra_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[B]], <2 x i32> splat (i32 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) { return vrsra_n_s32(a, b, 3); }
-// CHECK-LABEL: @test_vrsraq_n_s8(
-// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %b, <16 x i8> splat (i8 -3))
-// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]]
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrsraq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[B]], <16 x i8> splat (i8 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VRSHR_N]]
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) { return vrsraq_n_s8(a, b, 3); }
-// CHECK-LABEL: @test_vrsraq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <8 x i16> [[TMP3]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrsraq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[B]], <8 x i16> splat (i16 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) { return vrsraq_n_s16(a, b, 3); }
-// CHECK-LABEL: @test_vrsraq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <4 x i32> [[TMP3]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrsraq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[B]], <4 x i32> splat (i32 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) { return vrsraq_n_s32(a, b, 3); }
-// CHECK-LABEL: @test_vrsraq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <2 x i64> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vrsraq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[B]], <2 x i64> splat (i64 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) { return vrsraq_n_s64(a, b, 3); }
-// CHECK-LABEL: @test_vrsra_n_u8(
-// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %b, <8 x i8> splat (i8 -3))
-// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]]
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrsra_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[B]], <8 x i8> splat (i8 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VRSHR_N]]
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) { return vrsra_n_u8(a, b, 3); }
-// CHECK-LABEL: @test_vrsra_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <4 x i16> [[TMP3]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrsra_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[B]], <4 x i16> splat (i16 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) { return vrsra_n_u16(a, b, 3); }
-// CHECK-LABEL: @test_vrsra_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <2 x i32> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrsra_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[B]], <2 x i32> splat (i32 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) { return vrsra_n_u32(a, b, 3); }
-// CHECK-LABEL: @test_vrsraq_n_u8(
-// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %b, <16 x i8> splat (i8 -3))
-// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]]
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrsraq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[B]], <16 x i8> splat (i8 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VRSHR_N]]
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) { return vrsraq_n_u8(a, b, 3); }
-// CHECK-LABEL: @test_vrsraq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <8 x i16> [[TMP3]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrsraq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[B]], <8 x i16> splat (i16 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) { return vrsraq_n_u16(a, b, 3); }
-// CHECK-LABEL: @test_vrsraq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <4 x i32> [[TMP3]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrsraq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[B]], <4 x i32> splat (i32 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) { return vrsraq_n_u32(a, b, 3); }
-// CHECK-LABEL: @test_vrsraq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -3))
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <2 x i64> [[TMP3]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vrsraq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[B]], <2 x i64> splat (i64 -3))
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) { return vrsraq_n_u64(a, b, 3); }
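For reference: vrsra_n simply accumulates the rounding shift into the first operand, a + vrshr_n(b, n) per lane, i.e. the srshl/urshl call followed by an add in the checks above. A minimal scalar model of one signed 16-bit lane with shift amount 3 (illustrative helper, not part of the test file):

  int16_t vrsra_n_s16_lane(int16_t a, int16_t b) {
      return (int16_t)(a + (((int32_t)b + (1 << 2)) >> 3));
  }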
-// CHECK-LABEL: @test_vsri_n_s8(
-// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
-// CHECK: ret <8 x i8> [[VSRI_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsri_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VSRI_N]]
+//
 int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) { return vsri_n_s8(a, b, 3); }
-// CHECK-LABEL: @test_vsri_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 3)
-// CHECK: ret <4 x i16> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsri_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], i32 3)
+// CHECK-NEXT: ret <4 x i16> [[VSRI_N2]]
+//
 int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) { return vsri_n_s16(a, b, 3); }
-// CHECK-LABEL: @test_vsri_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 3)
-// CHECK: ret <2 x i32> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsri_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], i32 3)
+// CHECK-NEXT: ret <2 x i32> [[VSRI_N2]]
+//
 int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) { return vsri_n_s32(a, b, 3); }
-// CHECK-LABEL: @test_vsriq_n_s8(
-// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
-// CHECK: ret <16 x i8> [[VSRI_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsriq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <16 x i8> [[VSRI_N]]
+//
 int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) { return vsriq_n_s8(a, b, 3); }
-// CHECK-LABEL: @test_vsriq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 3)
-// CHECK: ret <8 x i16> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsriq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VSRI_N2]]
+//
 int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) { return vsriq_n_s16(a, b, 3); }
-// CHECK-LABEL: @test_vsriq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 3)
-// CHECK: ret <4 x i32> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsriq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VSRI_N2]]
+//
 int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) { return vsriq_n_s32(a, b, 3); }
-// CHECK-LABEL: @test_vsriq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 3)
-// CHECK: ret <2 x i64> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsriq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], i32 3)
+// CHECK-NEXT: ret <2 x i64> [[VSRI_N2]]
+//
 int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) { return vsriq_n_s64(a, b, 3); }
-// CHECK-LABEL: @test_vsri_n_u8(
-// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
-// CHECK: ret <8 x i8> [[VSRI_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsri_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VSRI_N]]
+//
 uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) { return vsri_n_u8(a, b, 3); }
-// CHECK-LABEL: @test_vsri_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 3)
-// CHECK: ret <4 x i16> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsri_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], i32 3)
+// CHECK-NEXT: ret <4 x i16> [[VSRI_N2]]
+//
 uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) { return vsri_n_u16(a, b, 3); }
-// CHECK-LABEL: @test_vsri_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 3)
-// CHECK: ret <2 x i32> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsri_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], i32 3)
+// CHECK-NEXT: ret <2 x i32> [[VSRI_N2]]
+//
 uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) { return vsri_n_u32(a, b, 3); }
-// CHECK-LABEL: @test_vsriq_n_u8(
-// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
-// CHECK: ret <16 x i8> [[VSRI_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsriq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <16 x i8> [[VSRI_N]]
+//
 uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) { return vsriq_n_u8(a, b, 3); }
-// CHECK-LABEL: @test_vsriq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 3)
-// CHECK: ret <8 x i16> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsriq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VSRI_N2]]
+//
 uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) { return vsriq_n_u16(a, b, 3); }
-// CHECK-LABEL: @test_vsriq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 3)
-// CHECK: ret <4 x i32> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsriq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VSRI_N2]]
+//
 uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) { return vsriq_n_u32(a, b, 3); }
-// CHECK-LABEL: @test_vsriq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 3)
-// CHECK: ret <2 x i64> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsriq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], i32 3)
+// CHECK-NEXT: ret <2 x i64> [[VSRI_N2]]
+//
 uint64x2_t test_vsriq_n_u64(uint64x2_t a, uint64x2_t b) { return vsriq_n_u64(a, b, 3); }
-// CHECK-LABEL: @test_vsri_n_p8(
-// CHECK: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
-// CHECK: ret <8 x i8> [[VSRI_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsri_n_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VSRI_N]]
+//
 poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) { return vsri_n_p8(a, b, 3); }
-// CHECK-LABEL: @test_vsri_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 15)
-// CHECK: ret <4 x i16> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsri_n_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], i32 15)
+// CHECK-NEXT: ret <4 x i16> [[VSRI_N2]]
+//
 poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) { return vsri_n_p16(a, b, 15); }
-// CHECK-LABEL: @test_vsriq_n_p8(
-// CHECK: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
-// CHECK: ret <16 x i8> [[VSRI_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsriq_n_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <16 x i8> [[VSRI_N]]
+//
 poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) { return vsriq_n_p8(a, b, 3); }
-// CHECK-LABEL: @test_vsriq_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 15)
-// CHECK: ret <8 x i16> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsriq_n_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], i32 15)
+// CHECK-NEXT: ret <8 x i16> [[VSRI_N2]]
+//
 poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) { return vsriq_n_p16(a, b, 15); }
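For reference: vsri_n ("shift right and insert") shifts each lane of the second operand right by n and inserts it into the first operand, preserving the top n bits of the destination; it stays an @llvm.aarch64.neon.vsri call either way, so instcombine only strips the bitcast round-trips. A minimal scalar model of one 8-bit lane with shift amount 3 (illustrative helper, not part of the test file):

  uint8_t vsri_n_u8_lane(uint8_t a, uint8_t b) {
      return (uint8_t)((a & ~(0xFFu >> 3)) | (b >> 3)); /* keep top 3 bits of a */
  }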
-// CHECK-LABEL: @test_vsli_n_s8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsli_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
 int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) { return vsli_n_s8(a, b, 3); }
-// CHECK-LABEL: @test_vsli_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 3)
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsli_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], i32 3)
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
 int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) { return vsli_n_s16(a, b, 3); }
-// CHECK-LABEL: @test_vsli_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 3)
-// CHECK: ret <2 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsli_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], i32 3)
+// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
+//
 int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) { return vsli_n_s32(a, b, 3); }
-// CHECK-LABEL: @test_vsliq_n_s8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsliq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
 int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) { return vsliq_n_s8(a, b, 3); }
-// CHECK-LABEL: @test_vsliq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 3)
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsliq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
 int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) { return vsliq_n_s16(a, b, 3); }
-// CHECK-LABEL: @test_vsliq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 3)
-// CHECK: ret <4 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsliq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
+//
 int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) { return vsliq_n_s32(a, b, 3); }
-// CHECK-LABEL: @test_vsliq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 3)
-// CHECK: ret <2 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsliq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], i32 3)
+// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
+//
 int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) { return vsliq_n_s64(a, b, 3); }
-// CHECK-LABEL: @test_vsli_n_u8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsli_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
 uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) { return vsli_n_u8(a, b, 3); }
-// CHECK-LABEL: @test_vsli_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 3)
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsli_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], i32 3)
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
 uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) { return vsli_n_u16(a, b, 3); }
-// CHECK-LABEL: @test_vsli_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 3)
-// CHECK: ret <2 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsli_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], i32 3)
+// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
+//
 uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) { return vsli_n_u32(a, b, 3); }
-// CHECK-LABEL: @test_vsliq_n_u8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsliq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
 uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) { return vsliq_n_u8(a, b, 3); }
-// CHECK-LABEL: @test_vsliq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 3)
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsliq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
 uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) { return vsliq_n_u16(a, b, 3); }
-// CHECK-LABEL: @test_vsliq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 3)
-// CHECK: ret <4 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsliq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], i32 3)
+// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
+//
 uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) { return vsliq_n_u32(a, b, 3); }
-// CHECK-LABEL: @test_vsliq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 3)
-// CHECK: ret <2 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsliq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], i32 3)
+// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
+//
 uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) { return vsliq_n_u64(a, b, 3); }
-// CHECK-LABEL: @test_vsli_n_p8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsli_n_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
 poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) { return vsli_n_p8(a, b, 3); }
-// CHECK-LABEL: @test_vsli_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 15)
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsli_n_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], i32 15)
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
 poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) { return vsli_n_p16(a, b, 15); }
-// CHECK-LABEL: @test_vsliq_n_p8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsliq_n_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], i32 3)
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
 poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) { return vsliq_n_p8(a, b, 3); }
-// CHECK-LABEL: @test_vsliq_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 15)
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsliq_n_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], i32 15)
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
 poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) { return vsliq_n_p16(a, b, 15); }
CHECK: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> splat (i16 3)) -// CHECK: ret <4 x i16> [[VQSHLU_N1]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqshlu_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[A]], <4 x i16> splat (i16 3)) +// CHECK-NEXT: ret <4 x i16> [[VQSHLU_N1]] +// uint16x4_t test_vqshlu_n_s16(int16x4_t a) { return vqshlu_n_s16(a, 3); } -// CHECK-LABEL: @test_vqshlu_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> splat (i32 3)) -// CHECK: ret <2 x i32> [[VQSHLU_N1]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqshlu_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[A]], <2 x i32> splat (i32 3)) +// CHECK-NEXT: ret <2 x i32> [[VQSHLU_N1]] +// uint32x2_t test_vqshlu_n_s32(int32x2_t a) { return vqshlu_n_s32(a, 3); } -// CHECK-LABEL: @test_vqshluq_n_s8( -// CHECK: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %a, <16 x i8> splat (i8 3)) -// CHECK: ret <16 x i8> [[VQSHLU_N]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vqshluq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 3)) +// CHECK-NEXT: ret <16 x i8> [[VQSHLU_N]] +// uint8x16_t test_vqshluq_n_s8(int8x16_t a) { return vqshluq_n_s8(a, 3); } -// CHECK-LABEL: @test_vqshluq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> splat (i16 3)) -// CHECK: ret <8 x i16> [[VQSHLU_N1]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqshluq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[A]], <8 x i16> splat (i16 3)) +// CHECK-NEXT: ret <8 x i16> [[VQSHLU_N1]] +// uint16x8_t test_vqshluq_n_s16(int16x8_t a) { return vqshluq_n_s16(a, 3); } -// CHECK-LABEL: @test_vqshluq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> splat (i32 3)) -// CHECK: ret <4 x i32> [[VQSHLU_N1]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqshluq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[A]], <4 x i32> splat (i32 3)) +// CHECK-NEXT: ret <4 x i32> [[VQSHLU_N1]] +// uint32x4_t test_vqshluq_n_s32(int32x4_t a) { return vqshluq_n_s32(a, 3); } -// CHECK-LABEL: @test_vqshluq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQSHLU_N1:%.*]] = call <2 x 
i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> splat (i64 3)) -// CHECK: ret <2 x i64> [[VQSHLU_N1]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqshluq_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[A]], <2 x i64> splat (i64 3)) +// CHECK-NEXT: ret <2 x i64> [[VQSHLU_N1]] +// uint64x2_t test_vqshluq_n_s64(int64x2_t a) { return vqshluq_n_s64(a, 3); } -// CHECK-LABEL: @test_vshrn_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 3) -// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> -// CHECK: ret <8 x i8> [[VSHRN_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vshrn_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = lshr <8 x i16> [[A]], splat (i16 3) +// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VSHRN_N]] +// int8x8_t test_vshrn_n_s16(int16x8_t a) { return vshrn_n_s16(a, 3); } -// CHECK-LABEL: @test_vshrn_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 9) -// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> -// CHECK: ret <4 x i16> [[VSHRN_N]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vshrn_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = lshr <4 x i32> [[A]], splat (i32 9) +// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP0]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VSHRN_N]] +// int16x4_t test_vshrn_n_s32(int32x4_t a) { return vshrn_n_s32(a, 9); } -// CHECK-LABEL: @test_vshrn_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 19) -// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> -// CHECK: ret <2 x i32> [[VSHRN_N]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vshrn_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[A]], splat (i64 19) +// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VSHRN_N]] +// int32x2_t test_vshrn_n_s64(int64x2_t a) { return vshrn_n_s64(a, 19); } -// CHECK-LABEL: @test_vshrn_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 3) -// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> -// CHECK: ret <8 x i8> [[VSHRN_N]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vshrn_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = lshr <8 x i16> [[A]], splat (i16 3) +// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VSHRN_N]] +// uint8x8_t test_vshrn_n_u16(uint16x8_t a) { return vshrn_n_u16(a, 3); } -// CHECK-LABEL: @test_vshrn_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to 
<16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 9) -// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> -// CHECK: ret <4 x i16> [[VSHRN_N]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vshrn_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = lshr <4 x i32> [[A]], splat (i32 9) +// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP0]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VSHRN_N]] +// uint16x4_t test_vshrn_n_u32(uint32x4_t a) { return vshrn_n_u32(a, 9); } -// CHECK-LABEL: @test_vshrn_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 19) -// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32> -// CHECK: ret <2 x i32> [[VSHRN_N]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vshrn_n_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[A]], splat (i64 19) +// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VSHRN_N]] +// uint32x2_t test_vshrn_n_u64(uint64x2_t a) { return vshrn_n_u64(a, 19); } -// CHECK-LABEL: @test_vshrn_high_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 3) -// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8> -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VSHRN_N]], <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vshrn_high_n_s16( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = lshr <8 x i16> [[B]], splat (i16 3) +// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VSHRN_N]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// int8x16_t test_vshrn_high_n_s16(int8x8_t a, int16x8_t b) { return vshrn_high_n_s16(a, b, 3); } -// CHECK-LABEL: @test_vshrn_high_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 9) -// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VSHRN_N]], <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshrn_high_n_s32( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = lshr <4 x i32> [[B]], splat (i32 9) +// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VSHRN_N]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// int16x8_t test_vshrn_high_n_s32(int16x4_t a, int32x4_t b) { return vshrn_high_n_s32(a, b, 9); } -// CHECK-LABEL: @test_vshrn_high_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> 
-// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 19)
-// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VSHRN_N]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshrn_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[B]], splat (i64 19)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VSHRN_N]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vshrn_high_n_s64(int32x2_t a, int64x2_t b) {
   return vshrn_high_n_s64(a, b, 19);
 }

-// CHECK-LABEL: @test_vshrn_high_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 3)
-// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VSHRN_N]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vshrn_high_n_u16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <8 x i16> [[B]], splat (i16 3)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VSHRN_N]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
   return vshrn_high_n_u16(a, b, 3);
 }

-// CHECK-LABEL: @test_vshrn_high_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 9)
-// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VSHRN_N]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshrn_high_n_u32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <4 x i32> [[B]], splat (i32 9)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VSHRN_N]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
   return vshrn_high_n_u32(a, b, 9);
 }

-// CHECK-LABEL: @test_vshrn_high_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 19)
-// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VSHRN_N]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshrn_high_n_u64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[B]], splat (i64 19)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VSHRN_N]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
   return vshrn_high_n_u64(a, b, 19);
 }

-// CHECK-LABEL: @test_vqshrun_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQSHRUN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshrun_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[A]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQSHRUN_N1]]
+//
 uint8x8_t test_vqshrun_n_s16(int16x8_t a) {
   return vqshrun_n_s16(a, 3);
 }

-// CHECK-LABEL: @test_vqshrun_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQSHRUN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshrun_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[A]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQSHRUN_N1]]
+//
 uint16x4_t test_vqshrun_n_s32(int32x4_t a) {
   return vqshrun_n_s32(a, 9);
 }

-// CHECK-LABEL: @test_vqshrun_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQSHRUN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshrun_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[A]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQSHRUN_N1]]
+//
 uint32x2_t test_vqshrun_n_s64(int64x2_t a) {
   return vqshrun_n_s64(a, 19);
 }

-// CHECK-LABEL: @test_vqshrun_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRUN_N1]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshrun_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[B]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQSHRUN_N1]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vqshrun_high_n_s16(int8x8_t a, int16x8_t b) {
   return vqshrun_high_n_s16(a, b, 3);
 }

-// CHECK-LABEL: @test_vqshrun_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRUN_N1]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshrun_high_n_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[B]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQSHRUN_N1]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vqshrun_high_n_s32(int16x4_t a, int32x4_t b) {
   return vqshrun_high_n_s32(a, b, 9);
 }

-// CHECK-LABEL: @test_vqshrun_high_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRUN_N1]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshrun_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[B]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQSHRUN_N1]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vqshrun_high_n_s64(int32x2_t a, int64x2_t b) {
   return vqshrun_high_n_s64(a, b, 19);
 }

-// CHECK-LABEL: @test_vrshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[A]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VRSHRN_N1]]
+//
 int8x8_t test_vrshrn_n_s16(int16x8_t a) {
   return vrshrn_n_s16(a, 3);
 }

-// CHECK-LABEL: @test_vrshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[A]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VRSHRN_N1]]
+//
 int16x4_t test_vrshrn_n_s32(int32x4_t a) {
   return vrshrn_n_s32(a, 9);
 }

-// CHECK-LABEL: @test_vrshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[A]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VRSHRN_N1]]
+//
 int32x2_t test_vrshrn_n_s64(int64x2_t a) {
   return vrshrn_n_s64(a, 19);
 }

-// CHECK-LABEL: @test_vrshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[A]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VRSHRN_N1]]
+//
 uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
   return vrshrn_n_u16(a, 3);
 }

-// CHECK-LABEL: @test_vrshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[A]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VRSHRN_N1]]
+//
 uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
   return vrshrn_n_u32(a, 9);
 }

-// CHECK-LABEL: @test_vrshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VRSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[A]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VRSHRN_N1]]
+//
 uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
   return vrshrn_n_u64(a, 19);
 }

-// CHECK-LABEL: @test_vrshrn_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VRSHRN_N1]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrshrn_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[B]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VRSHRN_N1]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vrshrn_high_n_s16(int8x8_t a, int16x8_t b) {
   return vrshrn_high_n_s16(a, b, 3);
 }

-// CHECK-LABEL: @test_vrshrn_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VRSHRN_N1]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrshrn_high_n_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[B]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VRSHRN_N1]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vrshrn_high_n_s32(int16x4_t a, int32x4_t b) {
   return vrshrn_high_n_s32(a, b, 9);
 }

-// CHECK-LABEL: @test_vrshrn_high_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VRSHRN_N1]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrshrn_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[B]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VRSHRN_N1]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vrshrn_high_n_s64(int32x2_t a, int64x2_t b) {
   return vrshrn_high_n_s64(a, b, 19);
 }

-// CHECK-LABEL: @test_vrshrn_high_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VRSHRN_N1]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrshrn_high_n_u16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[B]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VRSHRN_N1]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
   return vrshrn_high_n_u16(a, b, 3);
 }

-// CHECK-LABEL: @test_vrshrn_high_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VRSHRN_N1]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrshrn_high_n_u32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[B]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VRSHRN_N1]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
   return vrshrn_high_n_u32(a, b, 9);
 }
-// CHECK-LABEL: @test_vrshrn_high_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VRSHRN_N1]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrshrn_high_n_u64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[B]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VRSHRN_N1]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
   return vrshrn_high_n_u64(a, b, 19);
 }

-// CHECK-LABEL: @test_vqrshrun_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQRSHRUN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshrun_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[A]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQRSHRUN_N1]]
+//
 uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
   return vqrshrun_n_s16(a, 3);
 }

-// CHECK-LABEL: @test_vqrshrun_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQRSHRUN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshrun_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[A]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQRSHRUN_N1]]
+//
 uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
   return vqrshrun_n_s32(a, 9);
 }

-// CHECK-LABEL: @test_vqrshrun_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQRSHRUN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshrun_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[A]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQRSHRUN_N1]]
+//
 uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
   return vqrshrun_n_s64(a, 19);
 }

-// CHECK-LABEL: @test_vqrshrun_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRUN_N1]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshrun_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[B]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQRSHRUN_N1]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vqrshrun_high_n_s16(int8x8_t a, int16x8_t b) {
   return vqrshrun_high_n_s16(a, b, 3);
 }

-// CHECK-LABEL: @test_vqrshrun_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRUN_N1]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshrun_high_n_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[B]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQRSHRUN_N1]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vqrshrun_high_n_s32(int16x4_t a, int32x4_t b) {
   return vqrshrun_high_n_s32(a, b, 9);
 }

-// CHECK-LABEL: @test_vqrshrun_high_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRUN_N1]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshrun_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[B]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQRSHRUN_N1]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vqrshrun_high_n_s64(int32x2_t a, int64x2_t b) {
   return vqrshrun_high_n_s64(a, b, 19);
 }

-// CHECK-LABEL: @test_vqshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[A]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQSHRN_N1]]
+//
 int8x8_t test_vqshrn_n_s16(int16x8_t a) {
   return vqshrn_n_s16(a, 3);
 }

-// CHECK-LABEL: @test_vqshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[A]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQSHRN_N1]]
+//
 int16x4_t test_vqshrn_n_s32(int32x4_t a) {
   return vqshrn_n_s32(a, 9);
 }

-// CHECK-LABEL: @test_vqshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[A]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQSHRN_N1]]
+//
 int32x2_t test_vqshrn_n_s64(int64x2_t a) {
   return vqshrn_n_s64(a, 19);
 }

-// CHECK-LABEL: @test_vqshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[A]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQSHRN_N1]]
+//
 uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
   return vqshrn_n_u16(a, 3);
 }

-// CHECK-LABEL: @test_vqshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[A]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQSHRN_N1]]
+//
 uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
   return vqshrn_n_u32(a, 9);
 }

-// CHECK-LABEL: @test_vqshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[A]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQSHRN_N1]]
+//
 uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
   return vqshrn_n_u64(a, 19);
 }

-// CHECK-LABEL: @test_vqshrn_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRN_N1]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshrn_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[B]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQSHRN_N1]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vqshrn_high_n_s16(int8x8_t a, int16x8_t b) {
   return vqshrn_high_n_s16(a, b, 3);
 }

-// CHECK-LABEL: @test_vqshrn_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRN_N1]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshrn_high_n_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[B]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQSHRN_N1]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vqshrn_high_n_s32(int16x4_t a, int32x4_t b) {
   return vqshrn_high_n_s32(a, b, 9);
 }

-// CHECK-LABEL: @test_vqshrn_high_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRN_N1]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshrn_high_n_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[B]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQSHRN_N1]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vqshrn_high_n_s64(int32x2_t a, int64x2_t b) {
   return vqshrn_high_n_s64(a, b, 19);
 }

-// CHECK-LABEL: @test_vqshrn_high_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRN_N1]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshrn_high_n_u16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[B]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQSHRN_N1]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vqshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
   return vqshrn_high_n_u16(a, b, 3);
 }

-// CHECK-LABEL: @test_vqshrn_high_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRN_N1]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshrn_high_n_u32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[B]], i32 9)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQSHRN_N1]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vqshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
   return vqshrn_high_n_u32(a, b, 9);
 }

-// CHECK-LABEL: @test_vqshrn_high_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRN_N1]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshrn_high_n_u64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[B]], i32 19)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQSHRN_N1]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vqshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
   return vqshrn_high_n_u64(a, b, 19);
 }

-// CHECK-LABEL: @test_vqrshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[A]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQRSHRN_N1]]
+//
 int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
   return vqrshrn_n_s16(a, 3);
 }

-// CHECK-LABEL: @test_vqrshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[A]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQRSHRN_N1]]
+//
 int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
   return vqrshrn_n_s32(a, 9);
 }

-// CHECK-LABEL: @test_vqrshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[A]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQRSHRN_N1]]
+//
 int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
   return vqrshrn_n_s64(a, 19);
 }

-// CHECK-LABEL: @test_vqrshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
-// CHECK: ret <8 x i8> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqrshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[A]], i32 3)
+// CHECK-NEXT: ret <8 x i8> [[VQRSHRN_N1]]
+//
 uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
   return vqrshrn_n_u16(a, 3);
 }

-// CHECK-LABEL: @test_vqrshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
-// CHECK: ret <4 x i16> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqrshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[A]], i32 9)
+// CHECK-NEXT: ret <4 x i16> [[VQRSHRN_N1]]
+//
 uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
   return vqrshrn_n_u32(a, 9);
 }

-// CHECK-LABEL: @test_vqrshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
-// CHECK: ret <2 x i32> [[VQRSHRN_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqrshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[A]], i32 19)
+// CHECK-NEXT: ret <2 x i32> [[VQRSHRN_N1]]
+//
 uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
   return vqrshrn_n_u64(a, 19);
 }

-// CHECK-LABEL: @test_vqrshrn_high_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRN_N1]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshrn_high_n_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[B]], i32 3)
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQRSHRN_N1]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vqrshrn_high_n_s16(int8x8_t a, int16x8_t b) {
   return vqrshrn_high_n_s16(a, b, 3);
 }

-// CHECK-LABEL: @test_vqrshrn_high_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9) -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRN_N1]], <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshrn_high_n_s32( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[B]], i32 9) +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQRSHRN_N1]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// int16x8_t test_vqrshrn_high_n_s32(int16x4_t a, int32x4_t b) { return vqrshrn_high_n_s32(a, b, 9); } -// CHECK-LABEL: @test_vqrshrn_high_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19) -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRN_N1]], <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshrn_high_n_s64( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[B]], i32 19) +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQRSHRN_N1]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// int32x4_t test_vqrshrn_high_n_s64(int32x2_t a, int64x2_t b) { return vqrshrn_high_n_s64(a, b, 19); } -// CHECK-LABEL: @test_vqrshrn_high_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3) -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRN_N1]], <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vqrshrn_high_n_u16( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[B]], i32 3) +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQRSHRN_N1]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// uint8x16_t test_vqrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) { return vqrshrn_high_n_u16(a, b, 3); } -// CHECK-LABEL: @test_vqrshrn_high_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9) -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRN_N1]], <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqrshrn_high_n_u32( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[B]], i32 9) +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQRSHRN_N1]], <8 x i32> +// 
CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// uint16x8_t test_vqrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) { return vqrshrn_high_n_u32(a, b, 9); } -// CHECK-LABEL: @test_vqrshrn_high_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19) -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRN_N1]], <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqrshrn_high_n_u64( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[B]], i32 19) +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQRSHRN_N1]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// uint32x4_t test_vqrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) { return vqrshrn_high_n_u64(a, b, 19); } -// CHECK-LABEL: @test_vshll_n_s8( -// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16> -// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 3) -// CHECK: ret <8 x i16> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <8 x i16> [[TMP0]], splat (i16 3) +// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]] +// int16x8_t test_vshll_n_s8(int8x8_t a) { return vshll_n_s8(a, 3); } -// CHECK-LABEL: @test_vshll_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 9) -// CHECK: ret <4 x i32> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[A]] to <4 x i32> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <4 x i32> [[TMP0]], splat (i32 9) +// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]] +// int32x4_t test_vshll_n_s16(int16x4_t a) { return vshll_n_s16(a, 9); } -// CHECK-LABEL: @test_vshll_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 19) -// CHECK: ret <2 x i64> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = sext <2 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <2 x i64> [[TMP0]], splat (i64 19) +// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]] +// int64x2_t test_vshll_n_s32(int32x2_t a) { return vshll_n_s32(a, 19); } -// CHECK-LABEL: @test_vshll_n_u8( -// CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16> -// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 3) -// CHECK: ret <8 x i16> [[VSHLL_N]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <8 x i16> [[TMP0]], splat (i16 3)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
 uint16x8_t test_vshll_n_u8(uint8x8_t a) {
   return vshll_n_u8(a, 3);
 }

-// CHECK-LABEL: @test_vshll_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 9)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <4 x i32> [[TMP0]], splat (i32 9)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
 uint32x4_t test_vshll_n_u16(uint16x4_t a) {
   return vshll_n_u16(a, 9);
 }

-// CHECK-LABEL: @test_vshll_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 19)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <2 x i64> [[TMP0]], splat (i64 19)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
 uint64x2_t test_vshll_n_u32(uint32x2_t a) {
   return vshll_n_u32(a, 19);
 }

-// CHECK-LABEL: @test_vshll_high_n_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 3)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_high_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <8 x i16> [[TMP0]], splat (i16 3)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
 int16x8_t test_vshll_high_n_s8(int8x16_t a) {
   return vshll_high_n_s8(a, 3);
 }

-// CHECK-LABEL: @test_vshll_high_n_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 9)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_high_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[SHUFFLE_I]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <4 x i32> [[TMP0]], splat (i32 9)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
 int32x4_t test_vshll_high_n_s16(int16x8_t a) {
   return vshll_high_n_s16(a, 9);
 }

-// CHECK-LABEL: @test_vshll_high_n_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 19)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_high_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <2 x i32> [[SHUFFLE_I]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <2 x i64> [[TMP0]], splat (i64 19)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
 int64x2_t test_vshll_high_n_s32(int32x4_t a) {
   return vshll_high_n_s32(a, 19);
 }

-// CHECK-LABEL: @test_vshll_high_n_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 3)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_high_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <8 x i16> [[TMP0]], splat (i16 3)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
 uint16x8_t test_vshll_high_n_u8(uint8x16_t a) {
   return vshll_high_n_u8(a, 3);
 }

-// CHECK-LABEL: @test_vshll_high_n_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 9)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_high_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[SHUFFLE_I]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <4 x i32> [[TMP0]], splat (i32 9)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
 uint32x4_t test_vshll_high_n_u16(uint16x8_t a) {
   return vshll_high_n_u16(a, 9);
 }

-// CHECK-LABEL: @test_vshll_high_n_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 19)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_high_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[SHUFFLE_I]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <2 x i64> [[TMP0]], splat (i64 19)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
 uint64x2_t test_vshll_high_n_u32(uint32x4_t a) {
   return vshll_high_n_u32(a, 19);
 }

-// CHECK-LABEL: @test_vmovl_s8(
-// CHECK: [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmovl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]]
+//
 int16x8_t test_vmovl_s8(int8x8_t a) {
   return vmovl_s8(a);
 }

-// CHECK-LABEL: @test_vmovl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I:%.*]] = sext <4 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmovl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]]
+//
 int32x4_t test_vmovl_s16(int16x4_t a) {
   return vmovl_s16(a);
 }

-// CHECK-LABEL: @test_vmovl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I:%.*]] = sext <2 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmovl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]]
+//
 int64x2_t test_vmovl_s32(int32x2_t a) {
   return vmovl_s32(a);
 }

-// CHECK-LABEL: @test_vmovl_u8(
-// CHECK: [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmovl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]]
+//
 uint16x8_t test_vmovl_u8(uint8x8_t a) {
   return vmovl_u8(a);
 }

-// CHECK-LABEL: @test_vmovl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I:%.*]] = zext <4 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmovl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]]
+//
 uint32x4_t test_vmovl_u16(uint16x4_t a) {
   return vmovl_u16(a);
 }

-// CHECK-LABEL: @test_vmovl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I:%.*]] = zext <2 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmovl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]]
+//
 uint64x2_t test_vmovl_u32(uint32x2_t a) {
   return vmovl_u32(a);
 }

-// CHECK-LABEL: @test_vmovl_high_s8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmovl_high_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vmovl_high_s8(int8x16_t a) {
   return vmovl_high_s8(a);
 }

-// CHECK-LABEL: @test_vmovl_high_s16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmovl_high_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[SHUFFLE_I_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vmovl_high_s16(int16x8_t a) {
   return vmovl_high_s16(a);
 }

-// CHECK-LABEL: @test_vmovl_high_s32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmovl_high_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <2 x i32> [[SHUFFLE_I_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vmovl_high_s32(int32x4_t a) {
   return vmovl_high_s32(a);
 }

-// CHECK-LABEL: @test_vmovl_high_u8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmovl_high_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vmovl_high_u8(uint8x16_t a) {
   return vmovl_high_u8(a);
 }

-// CHECK-LABEL: @test_vmovl_high_u16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmovl_high_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[SHUFFLE_I_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vmovl_high_u16(uint16x8_t a) {
   return vmovl_high_u16(a);
 }

-// CHECK-LABEL: @test_vmovl_high_u32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmovl_high_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[SHUFFLE_I_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vmovl_high_u32(uint32x4_t a) {
   return vmovl_high_u32(a);
 }

-// CHECK-LABEL: @test_vcvt_n_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31)
-// CHECK: ret <2 x float> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcvt_n_f32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[A]], i32 31)
+// CHECK-NEXT: ret <2 x float> [[VCVT_N1]]
+//
 float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
   return vcvt_n_f32_s32(a, 31);
 }

-// CHECK-LABEL: @test_vcvtq_n_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31)
-// CHECK: ret <4 x float> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcvtq_n_f32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[A]], i32 31)
+// CHECK-NEXT: ret <4 x float> [[VCVT_N1]]
+//
 float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
   return vcvtq_n_f32_s32(a, 31);
 }

-// CHECK-LABEL: @test_vcvtq_n_f64_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50)
-// CHECK: ret <2 x double> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcvtq_n_f64_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> [[A]], i32 50)
+// CHECK-NEXT: ret <2 x double> [[VCVT_N1]]
+//
 float64x2_t test_vcvtq_n_f64_s64(int64x2_t a) {
   return vcvtq_n_f64_s64(a, 50);
 }

-// CHECK-LABEL: @test_vcvt_n_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31)
-// CHECK: ret <2 x float> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcvt_n_f32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[A]], i32 31)
+// CHECK-NEXT: ret <2 x float> [[VCVT_N1]]
+//
 float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
   return vcvt_n_f32_u32(a, 31);
 }

-// CHECK-LABEL: @test_vcvtq_n_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31)
-// CHECK: ret <4 x float> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcvtq_n_f32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[A]], i32 31)
+// CHECK-NEXT: ret <4 x float> [[VCVT_N1]]
+//
 float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
   return vcvtq_n_f32_u32(a, 31);
 }

-// CHECK-LABEL: @test_vcvtq_n_f64_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50)
-// CHECK: ret <2 x double> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcvtq_n_f64_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> [[A]], i32 50)
+// CHECK-NEXT: ret <2 x double> [[VCVT_N1]]
+//
 float64x2_t test_vcvtq_n_f64_u64(uint64x2_t a) {
   return vcvtq_n_f64_u64(a, 50);
 }

-// CHECK-LABEL: @test_vcvt_n_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31)
-// CHECK: ret <2 x i32> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvt_n_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[A]], i32 31)
+// CHECK-NEXT: ret <2 x i32> [[VCVT_N1]]
+//
 int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
   return vcvt_n_s32_f32(a, 31);
 }

-// CHECK-LABEL: @test_vcvtq_n_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31)
-// CHECK: ret <4 x i32> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtq_n_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[A]], i32 31)
+// CHECK-NEXT: ret <4 x i32> [[VCVT_N1]]
+//
 int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
   return vcvtq_n_s32_f32(a, 31);
 }

-// CHECK-LABEL: @test_vcvtq_n_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50)
-// CHECK: ret <2 x i64> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtq_n_s64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> [[A]], i32 50)
+// CHECK-NEXT: ret <2 x i64> [[VCVT_N1]]
+//
 int64x2_t test_vcvtq_n_s64_f64(float64x2_t a) {
   return vcvtq_n_s64_f64(a, 50);
 }

-// CHECK-LABEL: @test_vcvt_n_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31)
-// CHECK: ret <2 x i32> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvt_n_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[A]], i32 31)
+// CHECK-NEXT: ret <2 x i32> [[VCVT_N1]]
+//
 uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
   return vcvt_n_u32_f32(a, 31);
 }

-// CHECK-LABEL: @test_vcvtq_n_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31)
-// CHECK: ret <4 x i32> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtq_n_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[A]], i32 31)
+// CHECK-NEXT: ret <4 x i32> [[VCVT_N1]]
+//
 uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
   return vcvtq_n_u32_f32(a, 31);
 }

-// CHECK-LABEL: @test_vcvtq_n_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
-// CHECK: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50)
-// CHECK: ret <2 x i64> [[VCVT_N1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtq_n_u64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> [[A]], i32 50)
+// CHECK-NEXT: ret <2 x i64> [[VCVT_N1]]
+//
 uint64x2_t test_vcvtq_n_u64_f64(float64x2_t a) {
   return vcvtq_n_u64_f64(a, 50);
 }

-// CHECK-LABEL: @test_vaddl_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <8 x i16> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
 int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
   return vaddl_s8(a, b);
 }

-// CHECK-LABEL: @test_vaddl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <4 x i32> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
 int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
   return vaddl_s16(a, b);
 }

-// CHECK-LABEL: @test_vaddl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <2 x i64> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
 int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
   return vaddl_s32(a, b);
 }

-// CHECK-LABEL: @test_vaddl_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <8 x i16> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
 uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
   return vaddl_u8(a, b);
 }

-// CHECK-LABEL: @test_vaddl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <4 x i32> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
 uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
   return vaddl_u16(a, b);
 }

-// CHECK-LABEL: @test_vaddl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <2 x i64> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
 uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
   return vaddl_u32(a, b);
 }

-// CHECK-LABEL: @test_vaddl_high_s8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32>
-// CHECK: [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddl_high_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
 int16x8_t test_vaddl_high_s8(int8x16_t a, int8x16_t b) {
   return vaddl_high_s8(a, b);
 }

-// CHECK-LABEL: @test_vaddl_high_s16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = sext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddl_high_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
 int32x4_t test_vaddl_high_s16(int16x8_t a, int16x8_t b) {
   return vaddl_high_s16(a, b);
 }

-// CHECK-LABEL: @test_vaddl_high_s32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = sext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddl_high_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <2 x i64> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
 int64x2_t test_vaddl_high_s32(int32x4_t a, int32x4_t b) {
   return vaddl_high_s32(a, b);
 }

-// CHECK-LABEL: @test_vaddl_high_u8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32>
-// CHECK: [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddl_high_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
 uint16x8_t test_vaddl_high_u8(uint8x16_t a, uint8x16_t b) {
   return vaddl_high_u8(a, b);
 }

-// CHECK-LABEL: @test_vaddl_high_u16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = zext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddl_high_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <4 x i32> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
 uint32x4_t test_vaddl_high_u16(uint16x8_t a, uint16x8_t b) {
   return vaddl_high_u16(a, b);
 }

-// CHECK-LABEL: @test_vaddl_high_u32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = zext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[TMP1]], [[TMP3]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddl_high_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <2 x i64> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
 uint64x2_t test_vaddl_high_u32(uint32x4_t a, uint32x4_t b) {
   return vaddl_high_u32(a, b);
 }

-// CHECK-LABEL: @test_vaddw_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddw_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
 int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
   return vaddw_s8(a, b);
 }

-// CHECK-LABEL: @test_vaddw_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddw_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
 int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
   return vaddw_s16(a, b);
 }

-// CHECK-LABEL: @test_vaddw_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddw_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
 int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) {
   return vaddw_s32(a, b);
 }

-// CHECK-LABEL: @test_vaddw_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddw_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
 uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
   return vaddw_u8(a, b);
 }

-// CHECK-LABEL: @test_vaddw_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddw_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
 uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
   return vaddw_u16(a, b);
 }

-// CHECK-LABEL: @test_vaddw_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddw_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
 uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) {
   return vaddw_u32(a, b);
 }

-// CHECK-LABEL: @test_vaddw_high_s8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32>
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddw_high_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
 int16x8_t test_vaddw_high_s8(int16x8_t a, int8x16_t b) {
   return vaddw_high_s8(a, b);
 }

-// CHECK-LABEL: @test_vaddw_high_s16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP1]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddw_high_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
 int32x4_t test_vaddw_high_s16(int32x4_t a, int16x8_t b) {
   return vaddw_high_s16(a, b);
 }

-// CHECK-LABEL: @test_vaddw_high_s32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP1]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddw_high_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
 int64x2_t test_vaddw_high_s32(int64x2_t a, int32x4_t b) {
   return vaddw_high_s32(a, b);
 }

-// CHECK-LABEL: @test_vaddw_high_u8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32>
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddw_high_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
 uint16x8_t test_vaddw_high_u8(uint16x8_t a, uint8x16_t b) {
   return vaddw_high_u8(a, b);
 }

-// CHECK-LABEL: @test_vaddw_high_u16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP1]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddw_high_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
 uint32x4_t test_vaddw_high_u16(uint32x4_t a, uint16x8_t b) {
   return vaddw_high_u16(a, b);
 }

-// CHECK-LABEL: @test_vaddw_high_u32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP1]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vaddw_high_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
 uint64x2_t test_vaddw_high_u32(uint64x2_t a, uint32x4_t b) {
   return vaddw_high_u32(a, b);
 }

-// CHECK-LABEL: @test_vsubl_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <8 x i16> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
   return vsubl_s8(a, b);
 }

-// CHECK-LABEL: @test_vsubl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <4 x i32> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
   return vsubl_s16(a, b);
 }

-// CHECK-LABEL: @test_vsubl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <2 x i64> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
   return vsubl_s32(a, b);
 }

-// CHECK-LABEL: @test_vsubl_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <8 x i16> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
   return vsubl_u8(a, b);
 }

-// CHECK-LABEL: @test_vsubl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <4 x i32> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
   return vsubl_u16(a, b);
 }

-// CHECK-LABEL: @test_vsubl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <2 x i64> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
   return vsubl_u32(a, b);
 }

-// CHECK-LABEL: @test_vsubl_high_s8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32>
-// CHECK: [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubl_high_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 int16x8_t test_vsubl_high_s8(int8x16_t a, int8x16_t b) {
   return vsubl_high_s8(a, b);
 }

-// CHECK-LABEL: @test_vsubl_high_s16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = sext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubl_high_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <4 x i32> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 int32x4_t test_vsubl_high_s16(int16x8_t a, int16x8_t b) {
   return vsubl_high_s16(a, b);
 }

-// CHECK-LABEL: @test_vsubl_high_s32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = sext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubl_high_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <2 x i64> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 int64x2_t test_vsubl_high_s32(int32x4_t a, int32x4_t b) {
   return vsubl_high_s32(a, b);
 }

-// CHECK-LABEL: @test_vsubl_high_u8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32>
-// CHECK: [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubl_high_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 uint16x8_t test_vsubl_high_u8(uint8x16_t a, uint8x16_t b) {
   return vsubl_high_u8(a, b);
 }

-// CHECK-LABEL: @test_vsubl_high_u16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = zext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubl_high_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[SHUFFLE_I_I10_I]] to <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <4 x i32> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 uint32x4_t test_vsubl_high_u16(uint16x8_t a, uint16x8_t b) {
   return vsubl_high_u16(a, b);
 }

-// CHECK-LABEL: @test_vsubl_high_u32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = zext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[TMP1]], [[TMP3]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubl_high_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[SHUFFLE_I_I10_I]] to <2 x i64>
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <2 x i64> [[TMP0]], [[TMP1]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 uint64x2_t test_vsubl_high_u32(uint32x4_t a, uint32x4_t b) {
   return vsubl_high_u32(a, b);
 }

-// CHECK-LABEL: @test_vsubw_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubw_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
   return vsubw_s8(a, b);
 }

-// CHECK-LABEL: @test_vsubw_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubw_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
   return vsubw_s16(a, b);
 }

-// CHECK-LABEL: @test_vsubw_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubw_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I_I]]
[[VMOVL_I_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) { return vsubw_s32(a, b); } -// CHECK-LABEL: @test_vsubw_u8( -// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16> -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]] -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsubw_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16> +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) { return vsubw_u8(a, b); } -// CHECK-LABEL: @test_vsubw_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32> -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsubw_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32> +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) { return vsubw_u16(a, b); } -// CHECK-LABEL: @test_vsubw_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64> -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]] -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vsubw_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64> +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) { return vsubw_u32(a, b); } -// CHECK-LABEL: @test_vsubw_high_s8( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]] -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsubw_high_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16> +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[TMP0]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// int16x8_t test_vsubw_high_s8(int16x8_t a, int8x16_t b) { return vsubw_high_s8(a, b); } -// CHECK-LABEL: @test_vsubw_high_s16( -// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32> -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP1]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsubw_high_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
+// CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vsubw_high_s16(int32x4_t a, int16x8_t b) {
  return vsubw_high_s16(a, b);
}

-// CHECK-LABEL: @test_vsubw_high_s32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP1]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubw_high_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = sext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
int64x2_t test_vsubw_high_s32(int64x2_t a, int32x4_t b) {
  return vsubw_high_s32(a, b);
}

-// CHECK-LABEL: @test_vsubw_high_u8(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32>
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubw_high_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vsubw_high_u8(uint16x8_t a, uint8x16_t b) {
  return vsubw_high_u8(a, b);
}

-// CHECK-LABEL: @test_vsubw_high_u16(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP1]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubw_high_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[SHUFFLE_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vsubw_high_u16(uint32x4_t a, uint16x8_t b) {
  return vsubw_high_u16(a, b);
}

-// CHECK-LABEL: @test_vsubw_high_u32(
-// CHECK: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP1]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsubw_high_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[SHUFFLE_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
uint64x2_t test_vsubw_high_u32(uint64x2_t a, uint32x4_t b) {
  return vsubw_high_u32(a, b);
}

-// CHECK-LABEL: @test_vaddhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VADDHN2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vaddhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VADDHN2_I]]
+//
int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
  return vaddhn_s16(a, b);
}

-// CHECK-LABEL: @test_vaddhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VADDHN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vaddhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VADDHN2_I]]
+//
int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
  return vaddhn_s32(a, b);
}

-// CHECK-LABEL: @test_vaddhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VADDHN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vaddhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VADDHN2_I]]
+//
int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
  return vaddhn_s64(a, b);
}

-// CHECK-LABEL: @test_vaddhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VADDHN2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vaddhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VADDHN2_I]]
+//
uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
  return vaddhn_u16(a, b);
}

-// CHECK-LABEL: @test_vaddhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VADDHN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vaddhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VADDHN2_I]]
+//
uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
  return vaddhn_u32(a, b);
}

-// CHECK-LABEL: @test_vaddhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VADDHN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vaddhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VADDHN2_I]]
+//
uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
  return vaddhn_u64(a, b);
}

-// CHECK-LABEL: @test_vaddhn_high_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VADDHN_I_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], splat (i16 8)
-// CHECK: [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vaddhn_high_s16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], splat (i16 8)
+// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc nuw <8 x i16> [[VADDHN1_I_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VADDHN2_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
int8x16_t test_vaddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
  return vaddhn_high_s16(r, a, b);
}
-// CHECK-LABEL: @test_vaddhn_high_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VADDHN_I_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], splat (i32 16)
-// CHECK: [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddhn_high_s32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], splat (i32 16)
+// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc nuw <4 x i32> [[VADDHN1_I_I]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VADDHN2_I_I]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
int16x8_t test_vaddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
  return vaddhn_high_s32(r, a, b);
}

-// CHECK-LABEL: @test_vaddhn_high_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VADDHN_I_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], splat (i64 32)
-// CHECK: [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddhn_high_s64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], splat (i64 32)
+// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc nuw <2 x i64> [[VADDHN1_I_I]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VADDHN2_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
int32x4_t test_vaddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
  return vaddhn_high_s64(r, a, b);
}

-// CHECK-LABEL: @test_vaddhn_high_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VADDHN_I_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], splat (i16 8)
-// CHECK: [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vaddhn_high_u16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], splat (i16 8)
+// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc nuw <8 x i16> [[VADDHN1_I_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VADDHN2_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
uint8x16_t test_vaddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
  return vaddhn_high_u16(r, a, b);
}

-// CHECK-LABEL: @test_vaddhn_high_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VADDHN_I_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], splat (i32 16)
-// CHECK: [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vaddhn_high_u32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], splat (i32 16)
+// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc nuw <4 x i32> [[VADDHN1_I_I]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VADDHN2_I_I]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
uint16x8_t test_vaddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
  return vaddhn_high_u32(r, a, b);
}

-// CHECK-LABEL: @test_vaddhn_high_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VADDHN_I_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], splat (i64 32)
-// CHECK: [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vaddhn_high_u64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], splat (i64 32)
+// CHECK-NEXT: [[VADDHN2_I_I:%.*]] = trunc nuw <2 x i64> [[VADDHN1_I_I]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VADDHN2_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
uint32x4_t test_vaddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
  return vaddhn_high_u64(r, a, b);
}

-// CHECK-LABEL: @test_vraddhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vraddhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRADDHN_V2_I]]
+//
int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
  return vraddhn_s16(a, b);
}

-// CHECK-LABEL: @test_vraddhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vraddhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VRADDHN_V2_I]]
+//
int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
  return vraddhn_s32(a, b);
}

-// CHECK-LABEL: @test_vraddhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vraddhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VRADDHN_V2_I]]
+//
int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
  return vraddhn_s64(a, b);
}

-// CHECK-LABEL: @test_vraddhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vraddhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRADDHN_V2_I]]
+//
uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
  return vraddhn_u16(a, b);
}

-// CHECK-LABEL: @test_vraddhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vraddhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VRADDHN_V2_I]]
+//
uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
  return vraddhn_u32(a, b);
}

-// CHECK-LABEL: @test_vraddhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRADDHN_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vraddhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VRADDHN_V2_I]]
+//
uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
  return vraddhn_u64(a, b);
}

-// CHECK-LABEL: @test_vraddhn_high_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vraddhn_high_s16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
int8x16_t test_vraddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
  return vraddhn_high_s16(r, a, b);
}

-// CHECK-LABEL: @test_vraddhn_high_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRADDHN_V2_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vraddhn_high_s32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VRADDHN_V2_I_I]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
int16x8_t test_vraddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
  return vraddhn_high_s32(r, a, b);
}

-// CHECK-LABEL: @test_vraddhn_high_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRADDHN_V2_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vraddhn_high_s64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VRADDHN_V2_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
int32x4_t test_vraddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
  return vraddhn_high_s64(r, a, b);
}

-// CHECK-LABEL: @test_vraddhn_high_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vraddhn_high_u16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
uint8x16_t test_vraddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
  return vraddhn_high_u16(r, a, b);
}

-// CHECK-LABEL: @test_vraddhn_high_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRADDHN_V2_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vraddhn_high_u32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VRADDHN_V2_I_I]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
uint16x8_t test_vraddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
  return vraddhn_high_u32(r, a, b);
}

-// CHECK-LABEL: @test_vraddhn_high_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRADDHN_V2_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vraddhn_high_u64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VRADDHN_V2_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
uint32x4_t test_vraddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
  return vraddhn_high_u64(r, a, b);
}

-// CHECK-LABEL: @test_vsubhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSUBHN2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsubhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSUBHN2_I]]
+//
int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
  return vsubhn_s16(a, b);
}

-// CHECK-LABEL: @test_vsubhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSUBHN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsubhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSUBHN2_I]]
+//
int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
  return vsubhn_s32(a, b);
}

-// CHECK-LABEL: @test_vsubhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSUBHN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsubhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSUBHN2_I]]
+//
int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
  return vsubhn_s64(a, b);
}

-// CHECK-LABEL: @test_vsubhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSUBHN2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsubhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSUBHN2_I]]
+//
uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
  return vsubhn_u16(a, b);
}

-// CHECK-LABEL: @test_vsubhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSUBHN2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vsubhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSUBHN2_I]]
+//
uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
  return vsubhn_u32(a, b);
}

-// CHECK-LABEL: @test_vsubhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSUBHN2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vsubhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSUBHN2_I]]
+//
uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
  return vsubhn_u64(a, b);
}

-// CHECK-LABEL: @test_vsubhn_high_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], splat (i16 8)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsubhn_high_s16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], splat (i16 8)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc nuw <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VSUBHN2_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
int8x16_t test_vsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
  return vsubhn_high_s16(r, a, b);
}

-// CHECK-LABEL: @test_vsubhn_high_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], splat (i32 16)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubhn_high_s32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], splat (i32 16)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc nuw <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VSUBHN2_I_I]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
int16x8_t test_vsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
  return vsubhn_high_s32(r, a, b);
}

-// CHECK-LABEL: @test_vsubhn_high_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], splat (i64 32)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubhn_high_s64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], splat (i64 32)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc nuw <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VSUBHN2_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
int32x4_t test_vsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
  return vsubhn_high_s64(r, a, b);
}

-// CHECK-LABEL: @test_vsubhn_high_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], splat (i16 8)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vsubhn_high_u16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], splat (i16 8)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc nuw <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VSUBHN2_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
uint8x16_t test_vsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
  return vsubhn_high_u16(r, a, b);
}

-// CHECK-LABEL: @test_vsubhn_high_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], splat (i32 16)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vsubhn_high_u32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], splat (i32 16)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc nuw <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VSUBHN2_I_I]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
uint16x8_t test_vsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
  return vsubhn_high_u32(r, a, b);
}

-// CHECK-LABEL: @test_vsubhn_high_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSUBHN_I_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], splat (i64 32)
-// CHECK: [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsubhn_high_u64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I_I:%.*]] = sub <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], splat (i64 32)
+// CHECK-NEXT: [[VSUBHN2_I_I:%.*]] = trunc nuw <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VSUBHN2_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
uint32x4_t test_vsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
  return vsubhn_high_u64(r, a, b);
}

-// CHECK-LABEL: @test_vrsubhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrsubhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSUBHN_V2_I]]
+//
int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) {
  return vrsubhn_s16(a, b);
}

-// CHECK-LABEL: @test_vrsubhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrsubhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VRSUBHN_V2_I]]
+//
int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
  return vrsubhn_s32(a, b);
}

-// CHECK-LABEL: @test_vrsubhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrsubhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VRSUBHN_V2_I]]
+//
int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
  return vrsubhn_s64(a, b);
}

-// CHECK-LABEL: @test_vrsubhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrsubhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSUBHN_V2_I]]
+//
uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) {
  return vrsubhn_u16(a, b);
}

-// CHECK-LABEL: @test_vrsubhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vrsubhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VRSUBHN_V2_I]]
+//
uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
  return vrsubhn_u32(a, b);
}

-// CHECK-LABEL: @test_vrsubhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrsubhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VRSUBHN_V2_I]]
+//
uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
  return vrsubhn_u64(a, b);
}

-// CHECK-LABEL: @test_vrsubhn_high_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrsubhn_high_s16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
int8x16_t test_vrsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
  return vrsubhn_high_s16(r, a, b);
}

-// CHECK-LABEL: @test_vrsubhn_high_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRSUBHN_V2_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrsubhn_high_s32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VRSUBHN_V2_I_I]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
int16x8_t test_vrsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
  return vrsubhn_high_s32(r, a, b);
}

-// CHECK-LABEL: @test_vrsubhn_high_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRSUBHN_V2_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrsubhn_high_s64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VRSUBHN_V2_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
int32x4_t test_vrsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
  return vrsubhn_high_s64(r, a, b);
}

-// CHECK-LABEL: @test_vrsubhn_high_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrsubhn_high_u16(
+// CHECK-SAME: <8 x i8> noundef [[R:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> [[R]], <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I_I]]
+//
uint8x16_t test_vrsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
  return vrsubhn_high_u16(r, a, b);
}

-// CHECK-LABEL: @test_vrsubhn_high_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VRSUBHN_V2_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vrsubhn_high_u32(
+// CHECK-SAME: <4 x i16> noundef [[R:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> [[VRSUBHN_V2_I_I]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I_I]]
+//
uint16x8_t test_vrsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
  return vrsubhn_high_u32(r, a, b);
}

-// CHECK-LABEL: @test_vrsubhn_high_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VRSUBHN_V2_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vrsubhn_high_u64(
+// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> [[R]], <2 x i32> [[VRSUBHN_V2_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I_I]]
+//
uint32x4_t test_vrsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
  return vrsubhn_high_u64(r, a, b);
}

-// CHECK-LABEL: @test_vabdl_s8(
-// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabdl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I_I]]
+//
int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
  return vabdl_s8(a, b);
}

-// CHECK-LABEL: @test_vabdl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabdl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I_I]]
+//
int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
  return vabdl_s16(a, b);
}

-// CHECK-LABEL: @test_vabdl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabdl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I_I]]
+//
int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
  return vabdl_s32(a, b);
}

-// CHECK-LABEL: @test_vabdl_u8(
-// CHECK: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabdl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I_I]]
+//
uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
  return vabdl_u8(a, b);
}

-// CHECK-LABEL: @test_vabdl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabdl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I_I]]
+//
uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
  return vabdl_u16(a, b);
}

-// CHECK-LABEL: @test_vabdl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabdl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I_I]]
+//
uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
  return vabdl_u32(a, b);
}

-// CHECK-LABEL: @test_vabal_s8(
-// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabal_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vabal_s8(a, b, c);
}

-// CHECK-LABEL: @test_vabal_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabal_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vabal_s16(a, b, c);
}

-// CHECK-LABEL: @test_vabal_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabal_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vabal_s32(a, b, c);
}

-// CHECK-LABEL: @test_vabal_u8(
-// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vabal_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vabal_u8(a, b, c);
}

-// CHECK-LABEL: @test_vabal_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vabal_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vabal_u16(a, b, c);
}

-// CHECK-LABEL: @test_vabal_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vabal_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vabal_u32(a, b, c);
}

-// CHECK-LABEL: @test_vabdl_high_s8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32>
-// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8>
[[SHUFFLE_I7_I]]) -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[VMOVL_I_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vabdl_high_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VMOVL_I_I_I]] +// int16x8_t test_vabdl_high_s8(int8x16_t a, int8x16_t b) { return vabdl_high_s8(a, b); } -// CHECK-LABEL: @test_vabdl_high_s16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[VMOVL_I_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vabdl_high_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VMOVL_I_I_I]] +// int32x4_t test_vabdl_high_s16(int16x8_t a, int16x8_t b) { return vabdl_high_s16(a, b); } -// CHECK-LABEL: @test_vabdl_high_s32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[VMOVL_I_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vabdl_high_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64> 
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I_I_I]] +// int64x2_t test_vabdl_high_s32(int32x4_t a, int32x4_t b) { return vabdl_high_s32(a, b); } -// CHECK-LABEL: @test_vabdl_high_u8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[VMOVL_I_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vabdl_high_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VMOVL_I_I_I]] +// uint16x8_t test_vabdl_high_u8(uint8x16_t a, uint8x16_t b) { return vabdl_high_u8(a, b); } -// CHECK-LABEL: @test_vabdl_high_u16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[VMOVL_I_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vabdl_high_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VMOVL_I_I_I]] +// uint32x4_t test_vabdl_high_u16(uint16x8_t a, uint16x8_t b) { return vabdl_high_u16(a, b); } -// CHECK-LABEL: @test_vabdl_high_u32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[VMOVL_I_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vabdl_high_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef 
[[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VMOVL_I_I_I]] +// uint64x2_t test_vabdl_high_u32(uint32x4_t a, uint32x4_t b) { return vabdl_high_u32(a, b); } -// CHECK-LABEL: @test_vabal_high_s8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> -// CHECK: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) -// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16> -// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I_I]] -// CHECK: ret <8 x i16> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vabal_high_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16> +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I_I]] +// int16x8_t test_vabal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { return vabal_high_s8(a, b, c); } -// CHECK-LABEL: @test_vabal_high_s16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I_I]] to <4 x i32> -// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]] -// CHECK: ret <4 x i32> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vabal_high_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I_I]] to <4 x i32> +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I_I]] +// int32x4_t test_vabal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { return 
vabal_high_s16(a, b, c); } -// CHECK-LABEL: @test_vabal_high_s32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I_I]] to <2 x i64> -// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]] -// CHECK: ret <2 x i64> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vabal_high_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I_I]] to <2 x i64> +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I_I]] +// int64x2_t test_vabal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { return vabal_high_s32(a, b, c); } -// CHECK-LABEL: @test_vabal_high_u8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> -// CHECK: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) -// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16> -// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I_I]] -// CHECK: ret <8 x i16> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vabal_high_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16> +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I_I_I_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I_I]] +// uint16x8_t test_vabal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { return vabal_high_u8(a, b, c); } -// CHECK-LABEL: @test_vabal_high_u16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] 
to <8 x i8> -// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I_I]] to <4 x i32> -// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]] -// CHECK: ret <4 x i32> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vabal_high_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[VABD2_I_I_I_I]] to <4 x i32> +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I_I_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I_I]] +// uint32x4_t test_vabal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { return vabal_high_u16(a, b, c); } -// CHECK-LABEL: @test_vabal_high_u32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I_I]] to <2 x i64> -// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]] -// CHECK: ret <2 x i64> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vabal_high_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[VABD2_I_I_I_I]] to <2 x i64> +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I_I_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I_I]] +// uint64x2_t test_vabal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { return vabal_high_u32(a, b, c); } -// CHECK-LABEL: @test_vmull_s8( -// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i16> [[VMULL_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VMULL_I]] +// int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) { return vmull_s8(a, b); } -// CHECK-LABEL: @test_vmull_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b) -// CHECK: ret <4 x i32> [[VMULL2_I]] +// CHECK-LABEL: 
define dso_local <4 x i32> @test_vmull_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] +// int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) { return vmull_s16(a, b); } -// CHECK-LABEL: @test_vmull_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b) -// CHECK: ret <2 x i64> [[VMULL2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmull_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] +// int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) { return vmull_s32(a, b); } -// CHECK-LABEL: @test_vmull_u8( -// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i16> [[VMULL_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VMULL_I]] +// uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) { return vmull_u8(a, b); } -// CHECK-LABEL: @test_vmull_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b) -// CHECK: ret <4 x i32> [[VMULL2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmull_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] +// uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) { return vmull_u16(a, b); } -// CHECK-LABEL: @test_vmull_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b) -// CHECK: ret <2 x i64> [[VMULL2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmull_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] +// uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) { return vmull_u32(a, b); } -// CHECK-LABEL: @test_vmull_high_s8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) -// CHECK: ret <8 x i16> [[VMULL_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> 
@test_vmull_high_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: ret <8 x i16> [[VMULL_I_I]] +// int16x8_t test_vmull_high_s8(int8x16_t a, int8x16_t b) { return vmull_high_s8(a, b); } -// CHECK-LABEL: @test_vmull_high_s16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: ret <4 x i32> [[VMULL2_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmull_high_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]]) +// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]] +// int32x4_t test_vmull_high_s16(int16x8_t a, int16x8_t b) { return vmull_high_s16(a, b); } -// CHECK-LABEL: @test_vmull_high_s32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: ret <2 x i64> [[VMULL2_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmull_high_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]]) +// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]] +// int64x2_t test_vmull_high_s32(int32x4_t a, int32x4_t b) { return vmull_high_s32(a, b); } -// CHECK-LABEL: @test_vmull_high_u8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) -// CHECK: ret <8 x i16> [[VMULL_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_high_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: ret <8 x i16> [[VMULL_I_I]] +// uint16x8_t test_vmull_high_u8(uint8x16_t a, uint8x16_t b) { return vmull_high_u8(a, b); } -// CHECK-LABEL: @test_vmull_high_u16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: ret <4 x i32> [[VMULL2_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmull_high_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]]) +// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]] +// uint32x4_t test_vmull_high_u16(uint16x8_t a, uint16x8_t b) { return vmull_high_u16(a, b); } -// CHECK-LABEL: @test_vmull_high_u32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: ret <2 x i64> [[VMULL2_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmull_high_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]]) +// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]] +// uint64x2_t test_vmull_high_u32(uint32x4_t a, uint32x4_t b) { return vmull_high_u32(a, b); } -// CHECK-LABEL: @test_vmlal_s8( -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmlal_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return 
vmlal_s8(a, b, c); } -// CHECK-LABEL: @test_vmlal_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmlal_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlal_s16(a, b, c); } -// CHECK-LABEL: @test_vmlal_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmlal_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlal_s32(a, b, c); } -// CHECK-LABEL: @test_vmlal_u8( -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmlal_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vmlal_u8(a, b, c); } -// CHECK-LABEL: @test_vmlal_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmlal_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlal_u16(a, b, c); } -// CHECK-LABEL: @test_vmlal_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = 
bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmlal_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlal_u32(a, b, c); } -// CHECK-LABEL: @test_vmlal_high_s8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> -// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) -// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I_I]] -// CHECK: ret <8 x i16> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmlal_high_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I_I]] +// int16x8_t test_vmlal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { return vmlal_high_s8(a, b, c); } -// CHECK-LABEL: @test_vmlal_high_s16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]] -// CHECK: ret <4 x i32> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmlal_high_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I_I]] +// int32x4_t test_vmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { return vmlal_high_s16(a, b, c); } -// CHECK-LABEL: @test_vmlal_high_s32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> -// CHECK: 
[[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]] -// CHECK: ret <2 x i64> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmlal_high_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I_I]] +// int64x2_t test_vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { return vmlal_high_s32(a, b, c); } -// CHECK-LABEL: @test_vmlal_high_u8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> -// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) -// CHECK: [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I_I]] -// CHECK: ret <8 x i16> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmlal_high_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I_I_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I_I]] +// uint16x8_t test_vmlal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { return vmlal_high_u8(a, b, c); } -// CHECK-LABEL: @test_vmlal_high_u16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]] -// CHECK: ret <4 x i32> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmlal_high_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <4 x i32> [[A]], 
[[VMULL2_I_I_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I_I]] +// uint32x4_t test_vmlal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { return vmlal_high_u16(a, b, c); } -// CHECK-LABEL: @test_vmlal_high_u32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]] -// CHECK: ret <2 x i64> [[ADD_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmlal_high_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[ADD_I_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I_I]] +// uint64x2_t test_vmlal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { return vmlal_high_u32(a, b, c); } -// CHECK-LABEL: @test_vmlsl_s8( -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]] -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsl_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vmlsl_s8(a, b, c); } -// CHECK-LABEL: @test_vmlsl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsl_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlsl_s16(a, b, c); } -// CHECK-LABEL: @test_vmlsl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define 
dso_local <2 x i64> @test_vmlsl_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlsl_s32(a, b, c); } -// CHECK-LABEL: @test_vmlsl_u8( -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]] -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsl_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vmlsl_u8(a, b, c); } -// CHECK-LABEL: @test_vmlsl_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsl_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlsl_u16(a, b, c); } -// CHECK-LABEL: @test_vmlsl_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmlsl_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]]) +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlsl_u32(a, b, c); } -// CHECK-LABEL: @test_vmlsl_high_s8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> -// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) -// CHECK: [[SUB_I_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I_I]] -// CHECK: ret <8 x i16> [[SUB_I_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsl_high_s8( +// 
CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I_I]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I_I]] +// int16x8_t test_vmlsl_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) { return vmlsl_high_s8(a, b, c); } -// CHECK-LABEL: @test_vmlsl_high_s16( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]]) -// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]] -// CHECK: ret <4 x i32> [[SUB_I_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsl_high_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I_I]] +// int32x4_t test_vmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { return vmlsl_high_s16(a, b, c); } -// CHECK-LABEL: @test_vmlsl_high_s32( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> -// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8> -// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]]) -// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]] -// CHECK: ret <2 x i64> [[SUB_I_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vmlsl_high_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]]) +// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I_I]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I_I]] +// int64x2_t test_vmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { return vmlsl_high_s32(a, b, c); } -// CHECK-LABEL: @test_vmlsl_high_u8( -// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> -// CHECK: 
[[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: [[SUB_I_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmlsl_high_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> [[C]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I5_I]], <8 x i8> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I_I]]
+//
uint16x8_t test_vmlsl_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) { return vmlsl_high_u8(a, b, c); }
-// CHECK-LABEL: @test_vmlsl_high_u16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmlsl_high_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I_I]]
+//
uint32x4_t test_vmlsl_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) { return vmlsl_high_u16(a, b, c); }
-// CHECK-LABEL: @test_vmlsl_high_u32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmlsl_high_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[SUB_I_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I_I]]
+//
uint64x2_t test_vmlsl_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) { return vmlsl_high_u32(a, b, c); }
-// CHECK-LABEL: @test_vqdmull_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmull_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
+//
int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) { return vqdmull_s16(a, b); }
-// CHECK-LABEL: @test_vqdmull_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmull_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
+//
int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) { return vqdmull_s32(a, b); }
-// CHECK-LABEL: @test_vqdmlal_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
-// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmlal_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
+//
int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlal_s16(a, b, c); }
-// CHECK-LABEL: @test_vqdmlal_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
-// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmlal_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
+//
int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlal_s32(a, b, c); }
-// CHECK-LABEL: @test_vqdmlsl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
-// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmlsl_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
+//
int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlsl_s16(a, b, c); }
-// CHECK-LABEL: @test_vqdmlsl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
-// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmlsl_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
+//
int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlsl_s32(a, b, c); }
-// CHECK-LABEL: @test_vqdmull_high_s16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULL_V2_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmull_high_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]]
+//
int32x4_t test_vqdmull_high_s16(int16x8_t a, int16x8_t b) { return vqdmull_high_s16(a, b); }
-// CHECK-LABEL: @test_vqdmull_high_s32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQDMULL_V2_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmull_high_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]]
+//
int64x2_t test_vqdmull_high_s32(int32x4_t a, int32x4_t b) { return vqdmull_high_s32(a, b); }
-// CHECK-LABEL: @test_vqdmlal_high_s16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I_I]])
-// CHECK: ret <4 x i32> [[VQDMLAL_V3_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmlal_high_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]]
+//
int32x4_t test_vqdmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { return vqdmlal_high_s16(a, b, c); }
-// CHECK-LABEL: @test_vqdmlal_high_s32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I_I]])
-// CHECK: ret <2 x i64> [[VQDMLAL_V3_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmlal_high_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]]
+//
int64x2_t test_vqdmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { return vqdmlal_high_s32(a, b, c); }
-// CHECK-LABEL: @test_vqdmlsl_high_s16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[SHUFFLE_I7_I]])
-// CHECK: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I_I]])
-// CHECK: ret <4 x i32> [[VQDMLSL_V3_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqdmlsl_high_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I5_I]], <4 x i16> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]]
+//
int32x4_t test_vqdmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) { return vqdmlsl_high_s16(a, b, c); }
-// CHECK-LABEL: @test_vqdmlsl_high_s32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[SHUFFLE_I7_I]])
-// CHECK: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I_I]])
-// CHECK: ret <2 x i64> [[VQDMLSL_V3_I_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqdmlsl_high_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5_I:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> [[C]], <4 x i32> poison, <2 x i32> <i32 2, i32 3>
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I5_I]], <2 x i32> [[SHUFFLE_I_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]]
+//
int64x2_t test_vqdmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) { return vqdmlsl_high_s32(a, b, c); }
-// CHECK-LABEL: @test_vmull_p8(
-// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i16> [[VMULL_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I]]
+//
poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) { return vmull_p8(a, b); }
-// CHECK-LABEL: @test_vmull_high_p8(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]])
-// CHECK: ret <8 x i16> [[VMULL_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmull_high_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I5:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[SHUFFLE_I5]], <8 x i8> [[SHUFFLE_I]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I_I]]
+//
poly16x8_t test_vmull_high_p8(poly8x16_t a, poly8x16_t b) { return vmull_high_p8(a, b); }
-// CHECK-LABEL: @test_vaddd_s64(
-// CHECK: [[VADDD_I:%.*]] = add i64 %a, %b
-// CHECK: ret i64 [[VADDD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vaddd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDD_I:%.*]] = add i64 [[A]], [[B]]
+// CHECK-NEXT: ret i64 [[VADDD_I]]
+//
int64_t test_vaddd_s64(int64_t a, int64_t b) { return vaddd_s64(a, b); }
-// CHECK-LABEL: @test_vaddd_u64(
-// CHECK: [[VADDD_I:%.*]] = add i64 %a, %b
-// CHECK: ret i64 [[VADDD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vaddd_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDD_I:%.*]] = add i64 [[A]], [[B]]
+// CHECK-NEXT: ret i64 [[VADDD_I]]
+//
uint64_t test_vaddd_u64(uint64_t a, uint64_t b) { return vaddd_u64(a, b); }
-// CHECK-LABEL: @test_vsubd_s64(
-// CHECK: [[VSUBD_I:%.*]] = sub i64 %a, %b
-// CHECK: ret i64 [[VSUBD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vsubd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBD_I:%.*]] = sub i64 [[A]], [[B]]
+// CHECK-NEXT: ret i64 [[VSUBD_I]]
+//
int64_t test_vsubd_s64(int64_t a, int64_t b) { return vsubd_s64(a, b); }
-// CHECK-LABEL: @test_vsubd_u64(
-// CHECK: [[VSUBD_I:%.*]] = sub i64 %a, %b
-// CHECK: ret i64 [[VSUBD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vsubd_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBD_I:%.*]] = sub i64 [[A]], [[B]]
+// CHECK-NEXT: ret i64 [[VSUBD_I]]
+//
uint64_t test_vsubd_u64(uint64_t a, uint64_t b) { return vsubd_u64(a, b); }
-// CHECK-LABEL: @test_vqaddb_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_S8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vqaddb_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_S8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
int8_t test_vqaddb_s8(int8_t a, int8_t b) { return vqaddb_s8(a, b); }
-// CHECK-LABEL: @test_vqaddh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqaddh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vqaddh_s16(int16_t a, int16_t b) { return vqaddh_s16(a, b); }
-// CHECK-LABEL: @test_vqadds_s32(
-// CHECK: [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQADDS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqadds_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQADDS_S32_I]]
+//
int32_t test_vqadds_s32(int32_t a, int32_t b) { return vqadds_s32(a, b); }
-// CHECK-LABEL: @test_vqaddd_s64(
-// CHECK: [[VQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VQADDD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqaddd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VQADDD_S64_I]]
+//
int64_t test_vqaddd_s64(int64_t a, int64_t b) { return vqaddd_s64(a, b); }
-// CHECK-LABEL: @test_vqaddb_u8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_U8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vqaddb_u8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_U8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
uint8_t test_vqaddb_u8(uint8_t a, uint8_t b) { return vqaddb_u8(a, b); }
-// CHECK-LABEL: @test_vqaddh_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_U16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqaddh_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_U16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
uint16_t test_vqaddh_u16(uint16_t a, uint16_t b) { return vqaddh_u16(a, b); }
-// CHECK-LABEL: @test_vqadds_u32(
-// CHECK: [[VQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqadd.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQADDS_U32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqadds_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqadd.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQADDS_U32_I]]
+//
uint32_t test_vqadds_u32(uint32_t a, uint32_t b) { return vqadds_u32(a, b); }
-// CHECK-LABEL: @test_vqaddd_u64(
-// CHECK: [[VQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqadd.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VQADDD_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqaddd_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqadd.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VQADDD_U64_I]]
+//
uint64_t test_vqaddd_u64(uint64_t a, uint64_t b) { return vqaddd_u64(a, b); }
-// CHECK-LABEL: @test_vqsubb_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VQSUBB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_S8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vqsubb_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VQSUBB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_S8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
int8_t test_vqsubb_s8(int8_t a, int8_t b) { return vqsubb_s8(a, b); }
-// CHECK-LABEL: @test_vqsubh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqsubh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vqsubh_s16(int16_t a, int16_t b) { return vqsubh_s16(a, b); }
-// CHECK-LABEL: @test_vqsubs_s32(
-// CHECK: [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQSUBS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqsubs_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQSUBS_S32_I]]
+//
int32_t test_vqsubs_s32(int32_t a, int32_t b) { return vqsubs_s32(a, b); }
-// CHECK-LABEL: @test_vqsubd_s64(
-// CHECK: [[VQSUBD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VQSUBD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqsubd_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VQSUBD_S64_I]]
+//
int64_t test_vqsubd_s64(int64_t a, int64_t b) { return vqsubd_s64(a, b); }
-// CHECK-LABEL: @test_vqsubb_u8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VQSUBB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_U8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vqsubb_u8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VQSUBB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_U8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
uint8_t test_vqsubb_u8(uint8_t a, uint8_t b) { return vqsubb_u8(a, b); }
-// CHECK-LABEL: @test_vqsubh_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQSUBH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_U16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqsubh_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQSUBH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_U16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
uint16_t test_vqsubh_u16(uint16_t a, uint16_t b) { return vqsubh_u16(a, b); }
-// CHECK-LABEL: @test_vqsubs_u32(
-// CHECK: [[VQSUBS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqsub.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQSUBS_U32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqsubs_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqsub.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQSUBS_U32_I]]
+//
uint32_t test_vqsubs_u32(uint32_t a, uint32_t b) { return vqsubs_u32(a, b); }
-// CHECK-LABEL: @test_vqsubd_u64(
-// CHECK: [[VQSUBD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqsub.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VQSUBD_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqsubd_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSUBD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqsub.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VQSUBD_U64_I]]
+//
uint64_t test_vqsubd_u64(uint64_t a, uint64_t b) { return vqsubd_u64(a, b); }
-// CHECK-LABEL: @test_vshld_s64(
-// CHECK: [[VSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VSHLD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vshld_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VSHLD_S64_I]]
+//
int64_t test_vshld_s64(int64_t a, int64_t b) { return vshld_s64(a, b); }
-// CHECK-LABEL: @test_vshld_u64(
-// CHECK: [[VSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VSHLD_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vshld_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VSHLD_U64_I]]
+//
uint64_t test_vshld_u64(uint64_t a, int64_t b) { return vshld_u64(a, b); }
-// CHECK-LABEL: @test_vqshlb_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VQSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_S8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vqshlb_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VQSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_S8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
int8_t test_vqshlb_s8(int8_t a, int8_t b) { return vqshlb_s8(a, b); }
-// CHECK-LABEL: @test_vqshlh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqshlh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vqshlh_s16(int16_t a, int16_t b) { return vqshlh_s16(a, b); }
-// CHECK-LABEL: @test_vqshls_s32(
-// CHECK: [[VQSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQSHLS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqshls_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQSHLS_S32_I]]
+//
int32_t test_vqshls_s32(int32_t a, int32_t b) { return vqshls_s32(a, b); }
-// CHECK-LABEL: @test_vqshld_s64(
-// CHECK: [[VQSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VQSHLD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqshld_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VQSHLD_S64_I]]
+//
int64_t test_vqshld_s64(int64_t a, int64_t b) { return vqshld_s64(a, b); }
-// CHECK-LABEL: @test_vqshlb_u8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VQSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_U8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vqshlb_u8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VQSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_U8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
uint8_t test_vqshlb_u8(uint8_t a, int8_t b) { return vqshlb_u8(a, b); }
-// CHECK-LABEL: @test_vqshlh_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_U16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqshlh_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_U16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
uint16_t test_vqshlh_u16(uint16_t a, int16_t b) { return vqshlh_u16(a, b); }
-// CHECK-LABEL: @test_vqshls_u32(
-// CHECK: [[VQSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQSHLS_U32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqshls_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQSHLS_U32_I]]
+//
uint32_t test_vqshls_u32(uint32_t a, int32_t b) { return vqshls_u32(a, b); }
-// CHECK-LABEL: @test_vqshld_u64(
-// CHECK: [[VQSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VQSHLD_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqshld_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VQSHLD_U64_I]]
+//
uint64_t test_vqshld_u64(uint64_t a, int64_t b) { return vqshld_u64(a, b); }
-// CHECK-LABEL: @test_vrshld_s64(
-// CHECK: [[VRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VRSHLD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vrshld_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VRSHLD_S64_I]]
+//
int64_t test_vrshld_s64(int64_t a, int64_t b) { return vrshld_s64(a, b); }
-// CHECK-LABEL: @test_vrshld_u64(
-// CHECK: [[VRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VRSHLD_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vrshld_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VRSHLD_U64_I]]
+//
uint64_t test_vrshld_u64(uint64_t a, int64_t b) { return vrshld_u64(a, b); }
-// CHECK-LABEL: @test_vqrshlb_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VQRSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_S8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vqrshlb_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VQRSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_S8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
int8_t test_vqrshlb_s8(int8_t a, int8_t b) { return vqrshlb_s8(a, b); }
-// CHECK-LABEL: @test_vqrshlh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQRSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqrshlh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQRSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vqrshlh_s16(int16_t a, int16_t b) { return vqrshlh_s16(a, b); }
-// CHECK-LABEL: @test_vqrshls_s32(
-// CHECK: [[VQRSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrshl.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQRSHLS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqrshls_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrshl.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQRSHLS_S32_I]]
+//
int32_t test_vqrshls_s32(int32_t a, int32_t b) { return vqrshls_s32(a, b); }
-// CHECK-LABEL: @test_vqrshld_s64(
-// CHECK: [[VQRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VQRSHLD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqrshld_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VQRSHLD_S64_I]]
+//
int64_t test_vqrshld_s64(int64_t a, int64_t b) { return vqrshld_s64(a, b); }
-// CHECK-LABEL: @test_vqrshlb_u8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0
-// CHECK: [[VQRSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_U8_I]], i64 0
-// CHECK: ret i8 [[TMP2]]
+// CHECK-LABEL: define dso_local i8 @test_vqrshlb_u8(
+// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0
+// CHECK-NEXT: [[VQRSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_U8_I]], i64 0
+// CHECK-NEXT: ret i8 [[TMP2]]
+//
uint8_t test_vqrshlb_u8(uint8_t a, int8_t b) { return vqrshlb_u8(a, b); }
-// CHECK-LABEL: @test_vqrshlh_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQRSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_U16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqrshlh_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQRSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_U16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
uint16_t test_vqrshlh_u16(uint16_t a, int16_t b) { return vqrshlh_u16(a, b); }
-// CHECK-LABEL: @test_vqrshls_u32(
-// CHECK: [[VQRSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqrshl.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQRSHLS_U32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqrshls_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqrshl.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQRSHLS_U32_I]]
+//
uint32_t test_vqrshls_u32(uint32_t a, int32_t b) { return vqrshls_u32(a, b); }
-// CHECK-LABEL: @test_vqrshld_u64(
-// CHECK: [[VQRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 %a, i64 %b)
-// CHECK: ret i64 [[VQRSHLD_U64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqrshld_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 [[A]], i64 [[B]])
+// CHECK-NEXT: ret i64 [[VQRSHLD_U64_I]]
+//
uint64_t test_vqrshld_u64(uint64_t a, int64_t b) { return vqrshld_u64(a, b); }
-// CHECK-LABEL: @test_vpaddd_s64(
-// CHECK: [[VPADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a)
-// CHECK: ret i64 [[VPADDD_S64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vpaddd_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A]])
+// CHECK-NEXT: ret i64 [[VPADDD_S64_I]]
+//
int64_t test_vpaddd_s64(int64x2_t a) { return vpaddd_s64(a); }
-// CHECK-LABEL: @test_vpadds_f32(
-// CHECK: [[LANE0_I:%.*]] = extractelement <2 x float> %a, i64 0
-// CHECK: [[LANE1_I:%.*]] = extractelement <2 x float> %a, i64 1
-// CHECK: [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]]
-// CHECK: ret float [[VPADDD_I]]
+// CHECK-LABEL: define dso_local float @test_vpadds_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE0_I:%.*]] = extractelement <2 x float> [[A]], i64 0
+// CHECK-NEXT: [[LANE1_I:%.*]] = extractelement <2 x float> [[A]], i64 1
+// CHECK-NEXT: [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]]
+// CHECK-NEXT: ret float [[VPADDD_I]]
+//
float32_t test_vpadds_f32(float32x2_t a) { return vpadds_f32(a); }
-// CHECK-LABEL: @test_vpaddd_f64(
-// CHECK: [[LANE0_I:%.*]] = extractelement <2 x double> %a, i64 0
-// CHECK: [[LANE1_I:%.*]] = extractelement <2 x double> %a, i64 1
-// CHECK: [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]]
-// CHECK: ret double [[VPADDD_I]]
+// CHECK-LABEL: define dso_local double @test_vpaddd_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE0_I:%.*]] = extractelement <2 x double> [[A]], i64 0
+// CHECK-NEXT: [[LANE1_I:%.*]] = extractelement <2 x double> [[A]], i64 1
+// CHECK-NEXT: [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]]
+// CHECK-NEXT: ret double [[VPADDD_I]]
+//
float64_t test_vpaddd_f64(float64x2_t a) { return vpaddd_f64(a); }
-// CHECK-LABEL: @test_vpmaxnms_f32(
-// CHECK: [[VPMAXNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> %a)
-// CHECK: ret float [[VPMAXNMS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vpmaxnms_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAXNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret float [[VPMAXNMS_F32_I]]
+//
float32_t test_vpmaxnms_f32(float32x2_t a) { return vpmaxnms_f32(a); }
-// CHECK-LABEL: @test_vpmaxnmqd_f64(
-// CHECK: [[VPMAXNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %a)
-// CHECK: ret double [[VPMAXNMQD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vpmaxnmqd_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAXNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret double [[VPMAXNMQD_F64_I]]
+//
float64_t test_vpmaxnmqd_f64(float64x2_t a) { return vpmaxnmqd_f64(a); }
-// CHECK-LABEL: @test_vpmaxs_f32(
-// CHECK: [[VPMAXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a)
-// CHECK: ret float [[VPMAXS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vpmaxs_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret float [[VPMAXS_F32_I]]
+//
float32_t test_vpmaxs_f32(float32x2_t a) { return vpmaxs_f32(a); }
-// CHECK-LABEL: @test_vpmaxqd_f64(
-// CHECK: [[VPMAXQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> %a)
-// CHECK: ret double [[VPMAXQD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vpmaxqd_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMAXQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret double [[VPMAXQD_F64_I]]
+//
float64_t test_vpmaxqd_f64(float64x2_t a) { return vpmaxqd_f64(a); }
-// CHECK-LABEL: @test_vpminnms_f32(
-// CHECK: [[VPMINNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> %a)
-// CHECK: ret float [[VPMINNMS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vpminnms_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMINNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret float [[VPMINNMS_F32_I]]
+//
float32_t test_vpminnms_f32(float32x2_t a) { return vpminnms_f32(a); }
-// CHECK-LABEL: @test_vpminnmqd_f64(
-// CHECK: [[VPMINNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %a)
-// CHECK: ret double [[VPMINNMQD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vpminnmqd_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMINNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret double [[VPMINNMQD_F64_I]]
+//
float64_t test_vpminnmqd_f64(float64x2_t a) { return vpminnmqd_f64(a); }
-// CHECK-LABEL: @test_vpmins_f32(
-// CHECK: [[VPMINS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> %a)
-// CHECK: ret float [[VPMINS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vpmins_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMINS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret float [[VPMINS_F32_I]]
+//
float32_t test_vpmins_f32(float32x2_t a) { return vpmins_f32(a); }
-// CHECK-LABEL: @test_vpminqd_f64(
-// CHECK: [[VPMINQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> %a)
-// CHECK: ret double [[VPMINQD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vpminqd_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPMINQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret double [[VPMINQD_F64_I]]
+//
float64_t test_vpminqd_f64(float64x2_t a) { return vpminqd_f64(a); }
-// CHECK-LABEL: @test_vqdmulhh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqdmulhh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vqdmulhh_s16(int16_t a, int16_t b) { return vqdmulhh_s16(a, b); }
-// CHECK-LABEL: @test_vqdmulhs_s32(
-// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQDMULHS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmulhs_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQDMULHS_S32_I]]
+//
int32_t test_vqdmulhs_s32(int32_t a, int32_t b) { return vqdmulhs_s32(a, b); }
-// CHECK-LABEL: @test_vqrdmulhh_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
-// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP2]]
+// CHECK-LABEL: define dso_local i16 @test_vqrdmulhh_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
int16_t test_vqrdmulhh_s16(int16_t a, int16_t b) { return vqrdmulhh_s16(a, b); }
-// CHECK-LABEL: @test_vqrdmulhs_s32(
-// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 %b)
-// CHECK: ret i32 [[VQRDMULHS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqrdmulhs_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[A]], i32 [[B]])
+// CHECK-NEXT: ret i32 [[VQRDMULHS_S32_I]]
+//
int32_t test_vqrdmulhs_s32(int32_t a, int32_t b) { return vqrdmulhs_s32(a, b); }
-// CHECK-LABEL: @test_vmulxs_f32(
-// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b)
-// CHECK: ret float [[VMULXS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vmulxs_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float [[A]], float [[B]])
+// CHECK-NEXT: ret float [[VMULXS_F32_I]]
+//
float32_t test_vmulxs_f32(float32_t a, float32_t b) { return vmulxs_f32(a, b); }
-// CHECK-LABEL: @test_vmulxd_f64(
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b)
-// CHECK: ret double [[VMULXD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vmulxd_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[A]], double [[B]])
+// CHECK-NEXT: ret double [[VMULXD_F64_I]]
+//
float64_t test_vmulxd_f64(float64_t a, float64_t b) { return vmulxd_f64(a, b); }
-// CHECK-LABEL: @test_vmulx_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[VMULX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmulx.v1f64(<1 x double> %a, <1 x double> %b)
-// CHECK: ret <1 x double> [[VMULX2_I]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmulx.v1f64(<1 x double> [[A]], <1 x double> [[B]])
+// CHECK-NEXT: ret <1 x double> [[VMULX2_I]]
+//
float64x1_t test_vmulx_f64(float64x1_t a, float64x1_t b) { return vmulx_f64(a, b); }
-// CHECK-LABEL: @test_vrecpss_f32(
-// CHECK: [[VRECPS_I:%.*]] = call float @llvm.aarch64.neon.frecps.f32(float %a, float %b)
-// CHECK: ret float [[VRECPS_I]]
+// CHECK-LABEL: define dso_local float @test_vrecpss_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPS_I:%.*]] = call float @llvm.aarch64.neon.frecps.f32(float [[A]], float [[B]])
+// CHECK-NEXT: ret float [[VRECPS_I]]
+//
float32_t test_vrecpss_f32(float32_t a, float32_t b) { return vrecpss_f32(a, b); }
-// CHECK-LABEL: @test_vrecpsd_f64(
-// CHECK: [[VRECPS_I:%.*]] = call double @llvm.aarch64.neon.frecps.f64(double %a, double %b)
-// CHECK: ret double [[VRECPS_I]]
+// CHECK-LABEL: define dso_local double @test_vrecpsd_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPS_I:%.*]] = call double @llvm.aarch64.neon.frecps.f64(double [[A]], double [[B]])
+// CHECK-NEXT: ret double [[VRECPS_I]]
+//
float64_t test_vrecpsd_f64(float64_t a, float64_t b) { return vrecpsd_f64(a, b); }
-// CHECK-LABEL: @test_vrsqrtss_f32(
-// CHECK: [[VRSQRTSS_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrts.f32(float %a, float %b)
-// CHECK: ret float [[VRSQRTSS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vrsqrtss_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTSS_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrts.f32(float [[A]], float [[B]])
+// CHECK-NEXT: ret float [[VRSQRTSS_F32_I]]
+//
float32_t test_vrsqrtss_f32(float32_t a, float32_t b) { return vrsqrtss_f32(a, b); }
-// CHECK-LABEL: @test_vrsqrtsd_f64(
-// CHECK: [[VRSQRTSD_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrts.f64(double %a, double %b)
-// CHECK: ret double [[VRSQRTSD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vrsqrtsd_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTSD_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrts.f64(double [[A]], double [[B]])
+// CHECK-NEXT: ret double [[VRSQRTSD_F64_I]]
+//
float64_t test_vrsqrtsd_f64(float64_t a, float64_t b) { return vrsqrtsd_f64(a, b); }
-// CHECK-LABEL: @test_vcvts_f32_s32(
-// CHECK: [[TMP0:%.*]] = sitofp i32 %a to float
-// CHECK: ret float [[TMP0]]
+// CHECK-LABEL: define dso_local float @test_vcvts_f32_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = sitofp i32 [[A]] to float
+// CHECK-NEXT: ret float [[TMP0]]
+//
float32_t test_vcvts_f32_s32(int32_t a) { return vcvts_f32_s32(a); }
-// CHECK-LABEL: @test_vcvtd_f64_s64(
-// CHECK: [[TMP0:%.*]] = sitofp i64 %a to double
-// CHECK: ret double [[TMP0]]
+// CHECK-LABEL: define dso_local double @test_vcvtd_f64_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = sitofp i64 [[A]] to double
+// CHECK-NEXT: ret double [[TMP0]]
+//
float64_t test_vcvtd_f64_s64(int64_t a) { return vcvtd_f64_s64(a); }
-// CHECK-LABEL: @test_vcvts_f32_u32(
-// CHECK: [[TMP0:%.*]] = uitofp i32 %a to float
-// CHECK: ret float [[TMP0]]
+// CHECK-LABEL: define dso_local float @test_vcvts_f32_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = uitofp i32 [[A]] to float
+// CHECK-NEXT: ret float [[TMP0]]
+//
float32_t test_vcvts_f32_u32(uint32_t a) { return vcvts_f32_u32(a); }
-// CHECK-LABEL: @test_vcvtd_f64_u64(
-// CHECK: [[TMP0:%.*]] = uitofp i64 %a to double
-// CHECK: ret double [[TMP0]]
+// CHECK-LABEL: define dso_local double @test_vcvtd_f64_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = uitofp i64 [[A]] to double
+// CHECK-NEXT: ret double [[TMP0]]
+//
float64_t test_vcvtd_f64_u64(uint64_t a) { return vcvtd_f64_u64(a); }
-// CHECK-LABEL: @test_vrecpes_f32(
-// CHECK: [[VRECPES_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpe.f32(float %a)
-// CHECK: ret float [[VRECPES_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vrecpes_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPES_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpe.f32(float [[A]])
+// CHECK-NEXT: ret float [[VRECPES_F32_I]]
+//
float32_t test_vrecpes_f32(float32_t a) { return vrecpes_f32(a); }
-// CHECK-LABEL: @test_vrecped_f64(
-// CHECK: [[VRECPED_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpe.f64(double %a)
-// CHECK: ret double [[VRECPED_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vrecped_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPED_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpe.f64(double [[A]])
+// CHECK-NEXT: ret double [[VRECPED_F64_I]]
+//
float64_t test_vrecped_f64(float64_t a) { return vrecped_f64(a); }
-// CHECK-LABEL: @test_vrecpxs_f32(
-// CHECK: [[VRECPXS_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpx.f32(float %a)
-// CHECK: ret float [[VRECPXS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vrecpxs_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPXS_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpx.f32(float [[A]])
+// CHECK-NEXT: ret float [[VRECPXS_F32_I]]
+//
float32_t test_vrecpxs_f32(float32_t a) { return vrecpxs_f32(a); }
-// CHECK-LABEL: @test_vrecpxd_f64(
-// CHECK: [[VRECPXD_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpx.f64(double %a)
-// CHECK: ret double [[VRECPXD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vrecpxd_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRECPXD_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpx.f64(double [[A]])
+// CHECK-NEXT: ret double [[VRECPXD_F64_I]]
+//
float64_t test_vrecpxd_f64(float64_t a) { return vrecpxd_f64(a); }
-// CHECK-LABEL: @test_vrsqrte_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> %a)
-// CHECK: ret <2 x i32> [[VRSQRTE_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vrsqrte_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <2
x i32> [[VRSQRTE_V1_I]] +// uint32x2_t test_vrsqrte_u32(uint32x2_t a) { return vrsqrte_u32(a); } -// CHECK-LABEL: @test_vrsqrteq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> %a) -// CHECK: ret <4 x i32> [[VRSQRTEQ_V1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vrsqrteq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VRSQRTEQ_V1_I]] +// uint32x4_t test_vrsqrteq_u32(uint32x4_t a) { return vrsqrteq_u32(a); } -// CHECK-LABEL: @test_vrsqrtes_f32( -// CHECK: [[VRSQRTES_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrte.f32(float %a) -// CHECK: ret float [[VRSQRTES_F32_I]] +// CHECK-LABEL: define dso_local float @test_vrsqrtes_f32( +// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTES_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrte.f32(float [[A]]) +// CHECK-NEXT: ret float [[VRSQRTES_F32_I]] +// float32_t test_vrsqrtes_f32(float32_t a) { return vrsqrtes_f32(a); } -// CHECK-LABEL: @test_vrsqrted_f64( -// CHECK: [[VRSQRTED_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrte.f64(double %a) -// CHECK: ret double [[VRSQRTED_F64_I]] +// CHECK-LABEL: define dso_local double @test_vrsqrted_f64( +// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTED_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrte.f64(double [[A]]) +// CHECK-NEXT: ret double [[VRSQRTED_F64_I]] +// float64_t test_vrsqrted_f64(float64_t a) { return vrsqrted_f64(a); } -// CHECK-LABEL: @test_vld1q_u8( -// CHECK: [[TMP1:%.*]] = load <16 x i8>, ptr %a, align 1 -// CHECK: ret <16 x i8> [[TMP1]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_u8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[A]], align 1 +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vld1q_u8(uint8_t const *a) { return vld1q_u8(a); } -// CHECK-LABEL: @test_vld1q_u16( -// CHECK: [[TMP2:%.*]] = load <8 x i16>, ptr %a, align 2 -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_u16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A]], align 2 +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// uint16x8_t test_vld1q_u16(uint16_t const *a) { return vld1q_u16(a); } -// CHECK-LABEL: @test_vld1q_u32( -// CHECK: [[TMP2:%.*]] = load <4 x i32>, ptr %a, align 4 -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_u32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A]], align 4 +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// uint32x4_t test_vld1q_u32(uint32_t const *a) { return vld1q_u32(a); } -// CHECK-LABEL: @test_vld1q_u64( -// CHECK: [[TMP2:%.*]] = load <2 x i64>, ptr %a, align 8 -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[A]], align 8 +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// uint64x2_t test_vld1q_u64(uint64_t const *a) { return vld1q_u64(a); } 
-// CHECK-LABEL: @test_vld1q_s8(
-// CHECK: [[TMP1:%.*]] = load <16 x i8>, ptr %a, align 1
-// CHECK: ret <16 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vld1q_s8(int8_t const *a) {
   return vld1q_s8(a);
 }
 
-// CHECK-LABEL: @test_vld1q_s16(
-// CHECK: [[TMP2:%.*]] = load <8 x i16>, ptr %a, align 2
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vld1q_s16(int16_t const *a) {
   return vld1q_s16(a);
 }
 
-// CHECK-LABEL: @test_vld1q_s32(
-// CHECK: [[TMP2:%.*]] = load <4 x i32>, ptr %a, align 4
-// CHECK: ret <4 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A]], align 4
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vld1q_s32(int32_t const *a) {
   return vld1q_s32(a);
 }
 
-// CHECK-LABEL: @test_vld1q_s64(
-// CHECK: [[TMP2:%.*]] = load <2 x i64>, ptr %a, align 8
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[A]], align 8
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vld1q_s64(int64_t const *a) {
   return vld1q_s64(a);
 }
 
-// CHECK-LABEL: @test_vld1q_f16(
-// CHECK: [[TMP2:%.*]] = load <8 x half>, ptr %a, align 2
-// CHECK: ret <8 x half> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vld1q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A]], align 2
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vld1q_f16(float16_t const *a) {
   return vld1q_f16(a);
 }
 
-// CHECK-LABEL: @test_vld1q_f32(
-// CHECK: [[TMP2:%.*]] = load <4 x float>, ptr %a, align 4
-// CHECK: ret <4 x float> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vld1q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vld1q_f32(float32_t const *a) {
   return vld1q_f32(a);
 }
 
-// CHECK-LABEL: @test_vld1q_f64(
-// CHECK: [[TMP2:%.*]] = load <2 x double>, ptr %a, align 8
-// CHECK: ret <2 x double> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vld1q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vld1q_f64(float64_t const *a) {
   return vld1q_f64(a);
 }
 
-// CHECK-LABEL: @test_vld1q_p8(
-// CHECK: [[TMP1:%.*]] = load <16 x i8>, ptr %a, align 1
-// CHECK: ret <16 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vld1q_p8(poly8_t const *a) {
   return vld1q_p8(a);
 }
 
-// CHECK-LABEL: @test_vld1q_p16(
-// CHECK: [[TMP2:%.*]] = load <8 x i16>, ptr %a, align 2
-// CHECK: ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vld1q_p16(poly16_t const *a) {
   return vld1q_p16(a);
 }
 
-// CHECK-LABEL: @test_vld1_u8(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vld1_u8(uint8_t const *a) {
   return vld1_u8(a);
 }
 
-// CHECK-LABEL: @test_vld1_u16(
-// CHECK: [[TMP2:%.*]] = load <4 x i16>, ptr %a, align 2
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vld1_u16(uint16_t const *a) {
   return vld1_u16(a);
 }
 
-// CHECK-LABEL: @test_vld1_u32(
-// CHECK: [[TMP2:%.*]] = load <2 x i32>, ptr %a, align 4
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[A]], align 4
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vld1_u32(uint32_t const *a) {
   return vld1_u32(a);
 }
 
-// CHECK-LABEL: @test_vld1_u64(
-// CHECK: [[TMP2:%.*]] = load <1 x i64>, ptr %a, align 8
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[A]], align 8
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 uint64x1_t test_vld1_u64(uint64_t const *a) {
   return vld1_u64(a);
 }
 
-// CHECK-LABEL: @test_vld1_s8(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vld1_s8(int8_t const *a) {
   return vld1_s8(a);
 }
 
-// CHECK-LABEL: @test_vld1_s16(
-// CHECK: [[TMP2:%.*]] = load <4 x i16>, ptr %a, align 2
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vld1_s16(int16_t const *a) {
   return vld1_s16(a);
 }
 
-// CHECK-LABEL: @test_vld1_s32(
-// CHECK: [[TMP2:%.*]] = load <2 x i32>, ptr %a, align 4
-// CHECK: ret <2 x i32> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[A]], align 4
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vld1_s32(int32_t const *a) {
   return vld1_s32(a);
 }
 
-// CHECK-LABEL: @test_vld1_s64(
-// CHECK: [[TMP2:%.*]] = load <1 x i64>, ptr %a, align 8
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[A]], align 8
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 int64x1_t test_vld1_s64(int64_t const *a) {
   return vld1_s64(a);
 }
 
-// CHECK-LABEL: @test_vld1_f16(
-// CHECK: [[TMP2:%.*]] = load <4 x half>, ptr %a, align 2
-// CHECK: ret <4 x half> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vld1_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[A]], align 2
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vld1_f16(float16_t const *a) {
   return vld1_f16(a);
 }
 
-// CHECK-LABEL: @test_vld1_f32(
-// CHECK: [[TMP2:%.*]] = load <2 x float>, ptr %a, align 4
-// CHECK: ret <2 x float> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vld1_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vld1_f32(float32_t const *a) {
   return vld1_f32(a);
 }
 
-// CHECK-LABEL: @test_vld1_f64(
-// CHECK: [[TMP2:%.*]] = load <1 x double>, ptr %a, align 8
-// CHECK: ret <1 x double> [[TMP2]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vld1_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[A]], align 8
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
 float64x1_t test_vld1_f64(float64_t const *a) {
   return vld1_f64(a);
 }
 
-// CHECK-LABEL: @test_vld1_p8(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vld1_p8(poly8_t const *a) {
   return vld1_p8(a);
 }
 
-// CHECK-LABEL: @test_vld1_p16(
-// CHECK: [[TMP2:%.*]] = load <4 x i16>, ptr %a, align 2
-// CHECK: ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 2
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vld1_p16(poly16_t const *a) {
   return vld1_p16(a);
 }
 
-// CHECK-LABEL: @test_vld1_u8_void(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_u8_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vld1_u8_void(void *a) {
   return vld1_u8(a);
 }
 
-// CHECK-LABEL: @test_vld1_u16_void(
-// CHECK: [[TMP1:%.*]] = load <4 x i16>, ptr %a, align 1
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_u16_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 1
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vld1_u16_void(void *a) {
   return vld1_u16(a);
 }
 
-// CHECK-LABEL: @test_vld1_u32_void(
-// CHECK: [[TMP1:%.*]] = load <2 x i32>, ptr %a, align 1
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_u32_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[A]], align 1
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vld1_u32_void(void *a) {
   return vld1_u32(a);
 }
 
-// CHECK-LABEL: @test_vld1_u64_void(
-// CHECK: [[TMP1:%.*]] = load <1 x i64>, ptr %a, align 1
-// CHECK: ret <1 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_u64_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[A]], align 1
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 uint64x1_t test_vld1_u64_void(void *a) {
   return vld1_u64(a);
 }
 
-// CHECK-LABEL: @test_vld1_s8_void(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_s8_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vld1_s8_void(void *a) {
   return vld1_s8(a);
 }
 
-// CHECK-LABEL: @test_vld1_s16_void(
-// CHECK: [[TMP1:%.*]] = load <4 x i16>, ptr %a, align 1
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_s16_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 1
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vld1_s16_void(void *a) {
   return vld1_s16(a);
 }
 
-// CHECK-LABEL: @test_vld1_s32_void(
-// CHECK: [[TMP1:%.*]] = load <2 x i32>, ptr %a, align 1
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_s32_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[A]], align 1
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vld1_s32_void(void *a) {
   return vld1_s32(a);
 }
 
-// CHECK-LABEL: @test_vld1_s64_void(
-// CHECK: [[TMP1:%.*]] = load <1 x i64>, ptr %a, align 1
-// CHECK: ret <1 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_s64_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[A]], align 1
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 int64x1_t test_vld1_s64_void(void *a) {
   return vld1_s64(a);
 }
 
-// CHECK-LABEL: @test_vld1_f16_void(
-// CHECK: [[TMP1:%.*]] = load <4 x half>, ptr %a, align 1
-// CHECK: ret <4 x half> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vld1_f16_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[A]], align 1
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vld1_f16_void(void *a) {
   return vld1_f16(a);
 }
 
-// CHECK-LABEL: @test_vld1_f32_void(
-// CHECK: [[TMP1:%.*]] = load <2 x float>, ptr %a, align 1
-// CHECK: ret <2 x float> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vld1_f32_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 1
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vld1_f32_void(void *a) {
   return vld1_f32(a);
 }
 
-// CHECK-LABEL: @test_vld1_f64_void(
-// CHECK: [[TMP1:%.*]] = load <1 x double>, ptr %a, align 1
-// CHECK: ret <1 x double> [[TMP1]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vld1_f64_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[A]], align 1
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
 float64x1_t test_vld1_f64_void(void *a) {
   return vld1_f64(a);
 }
 
-// CHECK-LABEL: @test_vld1_p8_void(
-// CHECK: [[TMP1:%.*]] = load <8 x i8>, ptr %a, align 1
-// CHECK: ret <8 x i8> [[TMP1]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_p8_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 1
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vld1_p8_void(void *a) {
   return vld1_p8(a);
 }
 
-// CHECK-LABEL: @test_vld1_p16_void(
-// CHECK: [[TMP1:%.*]] = load <4 x i16>, ptr %a, align 1
-// CHECK: ret <4 x i16> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_p16_void(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 1
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vld1_p16_void(void *a) {
   return vld1_p16(a);
 }
 
-// CHECK-LABEL: @test_vld2q_u8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vld2q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] poison, [2 x <16 x i8>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X2_T]] [[TMP1]]
+//
 uint8x16x2_t test_vld2q_u8(uint8_t const *a) {
   return vld2q_u8(a);
 }
 
-// CHECK-LABEL: @test_vld2q_u16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vld2q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] poison, [2 x <8 x i16>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X2_T]] [[TMP1]]
+//
 uint16x8x2_t test_vld2q_u16(uint16_t const *a) {
   return vld2q_u16(a);
 }
 
-// CHECK-LABEL: @test_vld2q_u32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vld2q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2]], 0
+// CHECK-NEXT: store <4 x i32> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <4 x i32>] [[TMP0]], <4 x i32> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] poison, [2 x <4 x i32>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X2_T]] [[TMP1]]
+//
 uint32x4x2_t test_vld2q_u32(uint32_t const *a) {
   return vld2q_u32(a);
 }
 
-// CHECK-LABEL: @test_vld2q_u64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x2_t @test_vld2q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X2_T]] [[TMP1]]
+//
 uint64x2x2_t test_vld2q_u64(uint64_t const *a) {
   return vld2q_u64(a);
 }
 
-// CHECK-LABEL: @test_vld2q_s8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vld2q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] poison, [2 x <16 x i8>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X16X2_T]] [[TMP1]]
+//
 int8x16x2_t test_vld2q_s8(int8_t const *a) {
   return vld2q_s8(a);
 }
 
-// CHECK-LABEL: @test_vld2q_s16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vld2q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] poison, [2 x <8 x i16>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X8X2_T]] [[TMP1]]
+//
 int16x8x2_t test_vld2q_s16(int16_t const *a) {
   return vld2q_s16(a);
 }
 
-// CHECK-LABEL: @test_vld2q_s32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vld2q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2]], 0
+// CHECK-NEXT: store <4 x i32> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <4 x i32>] [[TMP0]], <4 x i32> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] poison, [2 x <4 x i32>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X4X2_T]] [[TMP1]]
+//
 int32x4x2_t test_vld2q_s32(int32_t const *a) {
   return vld2q_s32(a);
 }
 
-// CHECK-LABEL: @test_vld2q_s64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x2x2_t @test_vld2q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X2X2_T]] [[TMP1]]
+//
 int64x2x2_t test_vld2q_s64(int64_t const *a) {
   return vld2q_s64(a);
 }
 
-// CHECK-LABEL: @test_vld2q_f16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0(ptr %a)
-// CHECK: store { <8 x half>, <8 x half> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float16x8x2_t @test_vld2q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2]], 0
+// CHECK-NEXT: store <8 x half> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2]], 1
+// CHECK-NEXT: store <8 x half> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x half>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x half>] poison, <8 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <8 x half>] [[TMP0]], <8 x half> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] poison, [2 x <8 x half>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP1]]
+//
 float16x8x2_t test_vld2q_f16(float16_t const *a) {
   return vld2q_f16(a);
 }
 
-// CHECK-LABEL: @test_vld2q_f32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr %a)
-// CHECK: store { <4 x float>, <4 x float> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vld2q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2]], 0
+// CHECK-NEXT: store <4 x float> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2]], 1
+// CHECK-NEXT: store <4 x float> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x float>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x float>] poison, <4 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <4 x float>] [[TMP0]], <4 x float> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] poison, [2 x <4 x float>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X2_T]] [[TMP1]]
+//
 float32x4x2_t test_vld2q_f32(float32_t const *a) {
   return vld2q_f32(a);
 }
 
-// CHECK-LABEL: @test_vld2q_f64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr %a)
-// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x2x2_t @test_vld2q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2]], 0
+// CHECK-NEXT: store <2 x double> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2]], 1
+// CHECK-NEXT: store <2 x double> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT64X2X2_T]] poison, [2 x <2 x double>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X2_T]] [[TMP1]]
+//
 float64x2x2_t test_vld2q_f64(float64_t const *a) {
   return vld2q_f64(a);
 }
 
-// CHECK-LABEL: @test_vld2q_p8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vld2q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] poison, [2 x <16 x i8>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X2_T]] [[TMP1]]
+//
 poly8x16x2_t test_vld2q_p8(poly8_t const *a) {
   return vld2q_p8(a);
 }
 
-// CHECK-LABEL: @test_vld2q_p16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vld2q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] poison, [2 x <8 x i16>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X2_T]] [[TMP1]]
+//
 poly16x8x2_t test_vld2q_p16(poly16_t const *a) {
   return vld2q_p16(a);
 }
 
-// CHECK-LABEL: @test_vld2_u8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vld2_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] poison, [2 x <8 x i8>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X2_T]] [[TMP1]]
+//
 uint8x8x2_t test_vld2_u8(uint8_t const *a) {
   return vld2_u8(a);
 }
 
-// CHECK-LABEL: @test_vld2_u16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vld2_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] poison, [2 x <4 x i16>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X2_T]] [[TMP1]]
+//
 uint16x4x2_t test_vld2_u16(uint16_t const *a) {
   return vld2_u16(a);
 }
 
-// CHECK-LABEL: @test_vld2_u32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vld2_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i32>] [[TMP0]], <2 x i32> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] poison, [2 x <2 x i32>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X2_T]] [[TMP1]]
+//
 uint32x2x2_t test_vld2_u32(uint32_t const *a) {
   return vld2_u32(a);
 }
 
-// CHECK-LABEL: @test_vld2_u64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x1x2_t @test_vld2_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT64X1X2_T]] poison, [2 x <1 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X1X2_T]] [[TMP1]]
+//
 uint64x1x2_t test_vld2_u64(uint64_t const *a) {
   return vld2_u64(a);
 }
 
-// CHECK-LABEL: @test_vld2_s8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x2_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vld2_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] poison, [2 x <8 x i8>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X8X2_T]] [[TMP1]]
+//
 int8x8_t test_vld2_s8(int8_t const
*a) { return vld2_s8(a); } -// CHECK-LABEL: @test_vld2_s16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr %a) -// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int16x4x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vld2_s16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8 +// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 0 +// CHECK-NEXT: store <4 x i16> [[VLD2_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 1 +// CHECK-NEXT: store <4 x i16> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT3]], align 8 +// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK4]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] poison, [2 x <4 x i16>] [[DOTUNPACK5]], 0 +// CHECK-NEXT: ret [[STRUCT_INT16X4X2_T]] [[TMP1]] +// int16x4x2_t test_vld2_s16(int16_t const *a) { return vld2_s16(a); } -// CHECK-LABEL: @test_vld2_s32( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr %a) -// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int32x2x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vld2_s32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8 +// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2]], 0 +// CHECK-NEXT: store <2 x i32> [[VLD2_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x i32>, <2 
x i32> } [[VLD2]], 1 +// CHECK-NEXT: store <2 x i32> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT3]], align 8 +// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i32>] [[TMP0]], <2 x i32> [[DOTUNPACK_UNPACK4]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] poison, [2 x <2 x i32>] [[DOTUNPACK5]], 0 +// CHECK-NEXT: ret [[STRUCT_INT32X2X2_T]] [[TMP1]] +// int32x2x2_t test_vld2_s32(int32_t const *a) { return vld2_s32(a); } -// CHECK-LABEL: @test_vld2_s64( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8 -// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr %a) -// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int64x1x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int64x1x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int64x1x2_t @test_vld2_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X2_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8 +// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 0 +// CHECK-NEXT: store <1 x i64> [[VLD2_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 1 +// CHECK-NEXT: store <1 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT3]], align 8 +// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK4]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT64X1X2_T]] poison, [2 x <1 x i64>] [[DOTUNPACK5]], 0 +// CHECK-NEXT: ret [[STRUCT_INT64X1X2_T]] [[TMP1]] +// int64x1x2_t test_vld2_s64(int64_t const *a) { return vld2_s64(a); } -// CHECK-LABEL: @test_vld2_f16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[VLD2:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0(ptr %a) -// CHECK: store { <4 x half>, <4 
x half> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float16x4x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float16x4x2_t @test_vld2_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8 +// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2]], 0 +// CHECK-NEXT: store <4 x half> [[VLD2_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2]], 1 +// CHECK-NEXT: store <4 x half> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x half>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x half>] poison, <4 x half> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT3]], align 8 +// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <4 x half>] [[TMP0]], <4 x half> [[DOTUNPACK_UNPACK4]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T]] poison, [2 x <4 x half>] [[DOTUNPACK5]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP1]] +// float16x4x2_t test_vld2_f16(float16_t const *a) { return vld2_f16(a); } -// CHECK-LABEL: @test_vld2_f32( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0(ptr %a) -// CHECK: store { <2 x float>, <2 x float> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float32x2x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vld2_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8 +// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2]], 0 +// CHECK-NEXT: store <2 x float> [[VLD2_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2]], 1 +// CHECK-NEXT: store <2 x float> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 
dereferenceable(16) [[__RET]], i64 16, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x float>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x float>] poison, <2 x float> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT3]], align 8 +// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x float>] [[TMP0]], <2 x float> [[DOTUNPACK_UNPACK4]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] poison, [2 x <2 x float>] [[DOTUNPACK5]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X2_T]] [[TMP1]] +// float32x2x2_t test_vld2_f32(float32_t const *a) { return vld2_f32(a); } -// CHECK-LABEL: @test_vld2_f64( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0(ptr %a) -// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float64x1x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x1x2_t @test_vld2_f64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8 +// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2]], 0 +// CHECK-NEXT: store <1 x double> [[VLD2_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2]], 1 +// CHECK-NEXT: store <1 x double> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT3]], align 8 +// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK4]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT64X1X2_T]] poison, [2 x <1 x double>] [[DOTUNPACK5]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X2_T]] [[TMP1]] +// float64x1x2_t test_vld2_f64(float64_t const *a) { return vld2_f64(a); } -// CHECK-LABEL: @test_vld2_p8( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr %a) -// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// 
CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly8x8x2_t [[TMP5]] +// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vld2_p8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8 +// CHECK-NEXT: [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 0 +// CHECK-NEXT: store <8 x i8> [[VLD2_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2]], 1 +// CHECK-NEXT: store <8 x i8> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT3]], align 8 +// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK4]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] poison, [2 x <8 x i8>] [[DOTUNPACK5]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY8X8X2_T]] [[TMP1]] +// poly8x8x2_t test_vld2_p8(poly8_t const *a) { return vld2_p8(a); } -// CHECK-LABEL: @test_vld2_p16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr %a) -// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly16x4x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vld2_p16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8 +// CHECK-NEXT: [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 0 +// CHECK-NEXT: store <4 x i16> [[VLD2_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2]], 1 +// CHECK-NEXT: store <4 x i16> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0 +// 
CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT3]], align 8 +// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK4]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] poison, [2 x <4 x i16>] [[DOTUNPACK5]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY16X4X2_T]] [[TMP1]] +// poly16x4x2_t test_vld2_p16(poly16_t const *a) { return vld2_p16(a); } -// CHECK-LABEL: @test_vld3q_u8( -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr %a) -// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint8x16x3_t [[TMP5]] +// CHECK-LABEL: define dso_local %struct.uint8x16x3_t @test_vld3q_u8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16 +// CHECK-NEXT: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 0 +// CHECK-NEXT: store <16 x i8> [[VLD3_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 1 +// CHECK-NEXT: store <16 x i8> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 2 +// CHECK-NEXT: store <16 x i8> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <16 x i8>] [[TMP1]], <16 x i8> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT8X16X3_T]] poison, [3 x <16 x i8>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT8X16X3_T]] [[TMP2]] +// uint8x16x3_t test_vld3q_u8(uint8_t const *a) { return vld3q_u8(a); } -// CHECK-LABEL: @test_vld3q_u16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16 -// CHECK: 
[[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr %a) -// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint16x8x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint16x8x3_t @test_vld3q_u16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16 +// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 0 +// CHECK-NEXT: store <8 x i16> [[VLD3_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 1 +// CHECK-NEXT: store <8 x i16> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 2 +// CHECK-NEXT: store <8 x i16> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <8 x i16>] [[TMP1]], <8 x i16> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT16X8X3_T]] poison, [3 x <8 x i16>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT16X8X3_T]] [[TMP2]] +// uint16x8x3_t test_vld3q_u16(uint16_t const *a) { return vld3q_u16(a); } -// CHECK-LABEL: @test_vld3q_u32( -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %a) -// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint32x4x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint32x4x3_t @test_vld3q_u32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16 +// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 0 +// CHECK-NEXT: store <4 x i32> [[VLD3_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 1 +// CHECK-NEXT: store <4 x i32> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 2 +// CHECK-NEXT: store <4 x i32> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <4 x i32>] [[TMP0]], <4 x i32> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <4 x i32>] [[TMP1]], <4 x i32> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT32X4X3_T]] poison, [3 x <4 x i32>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT32X4X3_T]] [[TMP2]] +// uint32x4x3_t test_vld3q_u32(uint32_t const *a) { return vld3q_u32(a); } -// CHECK-LABEL: @test_vld3q_u64( -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint64x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint64x2x3_t @test_vld3q_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16 +// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD3_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1 +// CHECK-NEXT: store <2 x i64> 
[[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2 +// CHECK-NEXT: store <2 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT64X2X3_T]] [[TMP2]] +// uint64x2x3_t test_vld3q_u64(uint64_t const *a) { return vld3q_u64(a); } -// CHECK-LABEL: @test_vld3q_s8( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr %a) -// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP5:%.*]] = load %struct.int8x16x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int8x16x3_t [[TMP5]] +// CHECK-LABEL: define dso_local %struct.int8x16x3_t @test_vld3q_s8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16 +// CHECK-NEXT: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 0 +// CHECK-NEXT: store <16 x i8> [[VLD3_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 1 +// CHECK-NEXT: store <16 x i8> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 2 +// CHECK-NEXT: store <16 x i8> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <16 x i8>] poison, 
<16 x i8> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <16 x i8>] [[TMP1]], <16 x i8> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT8X16X3_T]] poison, [3 x <16 x i8>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_INT8X16X3_T]] [[TMP2]] +// int8x16x3_t test_vld3q_s8(int8_t const *a) { return vld3q_s8(a); } -// CHECK-LABEL: @test_vld3q_s16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr %a) -// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int16x8x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int16x8x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int16x8x3_t @test_vld3q_s16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16 +// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 0 +// CHECK-NEXT: store <8 x i16> [[VLD3_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 1 +// CHECK-NEXT: store <8 x i16> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 2 +// CHECK-NEXT: store <8 x i16> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <8 x i16>] [[TMP1]], <8 x i16> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = 
insertvalue [[STRUCT_INT16X8X3_T]] poison, [3 x <8 x i16>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_INT16X8X3_T]] [[TMP2]] +// int16x8x3_t test_vld3q_s16(int16_t const *a) { return vld3q_s16(a); } -// CHECK-LABEL: @test_vld3q_s32( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr %a) -// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int32x4x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int32x4x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int32x4x3_t @test_vld3q_s32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16 +// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 0 +// CHECK-NEXT: store <4 x i32> [[VLD3_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 1 +// CHECK-NEXT: store <4 x i32> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], 2 +// CHECK-NEXT: store <4 x i32> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <4 x i32>] [[TMP0]], <4 x i32> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <4 x i32>] [[TMP1]], <4 x i32> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT32X4X3_T]] poison, [3 x <4 x i32>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_INT32X4X3_T]] [[TMP2]] +// int32x4x3_t test_vld3q_s32(int32_t const *a) { return vld3q_s32(a); } -// CHECK-LABEL: @test_vld3q_s64( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 
false) -// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int64x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int64x2x3_t @test_vld3q_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16 +// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD3_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1 +// CHECK-NEXT: store <2 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2 +// CHECK-NEXT: store <2 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_INT64X2X3_T]] [[TMP2]] +// int64x2x3_t test_vld3q_s64(int64_t const *a) { return vld3q_s64(a); } -// CHECK-LABEL: @test_vld3q_f16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0(ptr %a) -// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float16x8x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float16x8x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float16x8x3_t @test_vld3q_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16 +// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } 
[[VLD3]], 0 +// CHECK-NEXT: store <8 x half> [[VLD3_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], 1 +// CHECK-NEXT: store <8 x half> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3]], 2 +// CHECK-NEXT: store <8 x half> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x half>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <8 x half>] poison, <8 x half> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <8 x half>] [[TMP0]], <8 x half> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <8 x half>] [[TMP1]], <8 x half> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT16X8X3_T]] poison, [3 x <8 x half>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X3_T]] [[TMP2]] +// float16x8x3_t test_vld3q_f16(float16_t const *a) { return vld3q_f16(a); } -// CHECK-LABEL: @test_vld3q_f32( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0(ptr %a) -// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float32x4x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float32x4x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float32x4x3_t @test_vld3q_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16 +// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], 0 +// CHECK-NEXT: store <4 x float> [[VLD3_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], 1 +// CHECK-NEXT: store <4 x float> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], 2 +// CHECK-NEXT: store <4 x float> 
[[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x float>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <4 x float>] poison, <4 x float> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <4 x float>] [[TMP0]], <4 x float> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <4 x float>] [[TMP1]], <4 x float> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT32X4X3_T]] poison, [3 x <4 x float>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X3_T]] [[TMP2]] +// float32x4x3_t test_vld3q_f32(float32_t const *a) { return vld3q_f32(a); } -// CHECK-LABEL: @test_vld3q_f64( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0(ptr %a) -// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float64x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x2x3_t @test_vld3q_f64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16 +// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 0 +// CHECK-NEXT: store <2 x double> [[VLD3_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 1 +// CHECK-NEXT: store <2 x double> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 2 +// CHECK-NEXT: store <2 x double> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: 
[[DOTUNPACK_UNPACK6:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x double>] [[TMP1]], <2 x double> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T]] poison, [3 x <2 x double>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X3_T]] [[TMP2]]
+//
float64x2x3_t test_vld3q_f64(float64_t const *a) { return vld3q_f64(a); }
-// CHECK-LABEL: @test_vld3q_p8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x3_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x3_t @test_vld3q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <16 x i8>] [[TMP1]], <16 x i8> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY8X16X3_T]] poison, [3 x <16 x i8>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X3_T]] [[TMP2]]
+//
poly8x16x3_t test_vld3q_p8(poly8_t const *a) { return vld3q_p8(a); }
-// CHECK-LABEL: @test_vld3q_p16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x3_t @test_vld3q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <8 x i16>] [[TMP1]], <8 x i16> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY16X8X3_T]] poison, [3 x <8 x i16>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X3_T]] [[TMP2]]
+//
poly16x8x3_t test_vld3q_p16(poly16_t const *a) { return vld3q_p16(a); }
-// CHECK-LABEL: @test_vld3_u8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x3_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x3_t @test_vld3_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <8 x i8>] [[TMP1]], <8 x i8> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT8X8X3_T]] poison, [3 x <8 x i8>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X3_T]] [[TMP2]]
+//
uint8x8x3_t test_vld3_u8(uint8_t const *a) { return vld3_u8(a); }
-// CHECK-LABEL: @test_vld3_u16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x3_t @test_vld3_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <4 x i16>] [[TMP1]], <4 x i16> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT16X4X3_T]] poison, [3 x <4 x i16>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X3_T]] [[TMP2]]
+//
uint16x4x3_t test_vld3_u16(uint16_t const *a) { return vld3_u16(a); }
-// CHECK-LABEL: @test_vld3_u32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x3_t @test_vld3_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i32>] [[TMP0]], <2 x i32> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i32>] [[TMP1]], <2 x i32> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT32X2X3_T]] poison, [3 x <2 x i32>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X3_T]] [[TMP2]]
+//
uint32x2x3_t test_vld3_u32(uint32_t const *a) { return vld3_u32(a); }
-// CHECK-LABEL: @test_vld3_u64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x1x3_t @test_vld3_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT64X1X3_T]] poison, [3 x <1 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X1X3_T]] [[TMP2]]
+//
uint64x1x3_t test_vld3_u64(uint64_t const *a) { return vld3_u64(a); }
-// CHECK-LABEL: @test_vld3_s8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.int8x8x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x3_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.int8x8x3_t @test_vld3_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <8 x i8>] [[TMP1]], <8 x i8> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT8X8X3_T]] poison, [3 x <8 x i8>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X8X3_T]] [[TMP2]]
+//
int8x8x3_t test_vld3_s8(int8_t const *a) { return vld3_s8(a); }
-// CHECK-LABEL: @test_vld3_s16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int16x4x3_t @test_vld3_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <4 x i16>] [[TMP1]], <4 x i16> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT16X4X3_T]] poison, [3 x <4 x i16>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X4X3_T]] [[TMP2]]
+//
int16x4x3_t test_vld3_s16(int16_t const *a) { return vld3_s16(a); }
-// CHECK-LABEL: @test_vld3_s32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x2x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int32x2x3_t @test_vld3_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i32>] [[TMP0]], <2 x i32> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i32>] [[TMP1]], <2 x i32> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT32X2X3_T]] poison, [3 x <2 x i32>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X2X3_T]] [[TMP2]]
+//
int32x2x3_t test_vld3_s32(int32_t const *a) { return vld3_s32(a); }
-// CHECK-LABEL: @test_vld3_s64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x1x3_t @test_vld3_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT64X1X3_T]] poison, [3 x <1 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X1X3_T]] [[TMP2]]
+//
int64x1x3_t test_vld3_s64(int64_t const *a) { return vld3_s64(a); }
-// CHECK-LABEL: @test_vld3_f16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0(ptr %a)
-// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float16x4x3_t @test_vld3_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], 0
+// CHECK-NEXT: store <4 x half> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], 1
+// CHECK-NEXT: store <4 x half> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3]], 2
+// CHECK-NEXT: store <4 x half> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x half>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <4 x half>] poison, <4 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <4 x half>] [[TMP0]], <4 x half> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <4 x half>] [[TMP1]], <4 x half> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT16X4X3_T]] poison, [3 x <4 x half>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X3_T]] [[TMP2]]
+//
float16x4x3_t test_vld3_f16(float16_t const *a) { return vld3_f16(a); }
-// CHECK-LABEL: @test_vld3_f32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0(ptr %a)
-// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float32x2x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float32x2x3_t @test_vld3_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], 0
+// CHECK-NEXT: store <2 x float> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], 1
+// CHECK-NEXT: store <2 x float> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], 2
+// CHECK-NEXT: store <2 x float> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x float>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x float>] poison, <2 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x float>] [[TMP0]], <2 x float> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x float>] [[TMP1]], <2 x float> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT32X2X3_T]] poison, [3 x <2 x float>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X3_T]] [[TMP2]]
+//
float32x2x3_t test_vld3_f32(float32_t const *a) { return vld3_f32(a); }
-// CHECK-LABEL: @test_vld3_f64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0(ptr %a)
-// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x1x3_t @test_vld3_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 0
+// CHECK-NEXT: store <1 x double> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 1
+// CHECK-NEXT: store <1 x double> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 2
+// CHECK-NEXT: store <1 x double> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <1 x double>] [[TMP1]], <1 x double> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T]] poison, [3 x <1 x double>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X3_T]] [[TMP2]]
+//
float64x1x3_t test_vld3_f64(float64_t const *a) { return vld3_f64(a); }
-// CHECK-LABEL: @test_vld3_p8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x3_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x3_t @test_vld3_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <8 x i8>] [[TMP1]], <8 x i8> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY8X8X3_T]] poison, [3 x <8 x i8>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X3_T]] [[TMP2]]
+//
poly8x8x3_t test_vld3_p8(poly8_t const *a) { return vld3_p8(a); }
-// CHECK-LABEL: @test_vld3_p16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x3_t @test_vld3_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <4 x i16>] [[TMP1]], <4 x i16> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY16X4X3_T]] poison, [3 x <4 x i16>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X3_T]] [[TMP2]]
+//
poly16x4x3_t test_vld3_p16(poly16_t const *a) { return vld3_p16(a); }
-// CHECK-LABEL: @test_vld4q_u8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x4_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x4_t @test_vld4q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 3
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <16 x i8>] [[TMP1]], <16 x i8> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <16 x i8>] [[TMP2]], <16 x i8> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] poison, [4 x <16 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X4_T]] [[TMP3]]
+//
uint8x16x4_t test_vld4q_u8(uint8_t const *a) { return vld4q_u8(a); }
-// CHECK-LABEL: @test_vld4q_u16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x4_t @test_vld4q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <8 x i16>] [[TMP1]], <8 x i16> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <8 x i16>] [[TMP2]], <8 x i16> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT16X8X4_T]] poison, [4 x <8 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X4_T]] [[TMP3]]
+//
uint16x8x4_t test_vld4q_u16(uint16_t const *a) { return vld4q_u16(a); }
-// CHECK-LABEL: @test_vld4q_u32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint32x4x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x4_t @test_vld4q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 0
+// CHECK-NEXT: store <4 x i32> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 2
+// CHECK-NEXT: store <4 x i32> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 3
+// CHECK-NEXT: store <4 x i32> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <4 x i32>] [[TMP0]], <4 x i32> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <4 x i32>] [[TMP1]], <4 x i32> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <4 x i32>] [[TMP2]], <4 x i32> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT32X4X4_T]] poison, [4 x <4 x i32>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X4_T]] [[TMP3]]
+//
uint32x4x4_t test_vld4q_u32(uint32_t const *a) { return vld4q_u32(a); }
-// CHECK-LABEL: @test_vld4q_u64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x4_t @test_vld4q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X4_T]] [[TMP3]]
+//
uint64x2x4_t test_vld4q_u64(uint64_t const *a) { return vld4q_u64(a); }
-// CHECK-LABEL: @test_vld4q_s8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.int8x16x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x4_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.int8x16x4_t @test_vld4q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 3
+// CHECK-NEXT: store <16 x i8> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <16 x i8>] [[TMP1]], <16 x i8> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <16 x i8>] [[TMP2]], <16 x i8> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] poison, [4 x <16 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X16X4_T]] [[TMP3]]
+//
int8x16x4_t test_vld4q_s8(int8_t const *a) { return vld4q_s8(a); }
-// CHECK-LABEL: @test_vld4q_s16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int16x8x4_t @test_vld4q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <8 x i16>] [[TMP1]], <8 x i16> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <8 x i16>] [[TMP2]], <8 x i16> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT16X8X4_T]] poison, [4 x <8 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X8X4_T]] [[TMP3]]
+//
int16x8x4_t test_vld4q_s16(int16_t const *a) { return vld4q_s16(a); }
-// CHECK-LABEL: @test_vld4q_s32(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int32x4x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int32x4x4_t @test_vld4q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 0
+// CHECK-NEXT: store <4 x i32> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 2
+// CHECK-NEXT: store <4 x i32> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], 3
+// CHECK-NEXT: store <4 x i32> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <4 x i32>] [[TMP0]], <4 x i32> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <4 x i32>] [[TMP1]], <4 x i32> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <4 x i32>] [[TMP2]], <4 x i32> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT32X4X4_T]] poison, [4 x <4 x i32>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X4X4_T]] [[TMP3]]
+//
int32x4x4_t test_vld4q_s32(int32_t const *a) { return vld4q_s32(a); }
-// CHECK-LABEL: @test_vld4q_s64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x2x4_t @test_vld4q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X2X4_T]] [[TMP3]]
+//
int64x2x4_t test_vld4q_s64(int64_t const *a) { return vld4q_s64(a); }
-// CHECK-LABEL: @test_vld4q_f16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0(ptr %a)
-// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float16x8x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float16x8x4_t @test_vld4q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], 0
+// CHECK-NEXT: store <8 x half> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], 1
+// CHECK-NEXT: store <8 x half> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], 2
+// CHECK-NEXT: store <8 x half> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4]], 3
+// CHECK-NEXT:
store <8 x half> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x half>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <8 x half>] poison, <8 x half> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <8 x half>] [[TMP0]], <8 x half> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT9]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <8 x half>] [[TMP1]], <8 x half> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT11]], align 16 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <8 x half>] [[TMP2]], <8 x half> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT16X8X4_T]] poison, [4 x <8 x half>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X4_T]] [[TMP3]] +// float16x8x4_t test_vld4q_f16(float16_t const *a) { return vld4q_f16(a); } -// CHECK-LABEL: @test_vld4q_f32( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0(ptr %a) -// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float32x4x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float32x4x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float32x4x4_t @test_vld4q_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16 +// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], 0 +// CHECK-NEXT: store <4 x float> [[VLD4_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], 1 +// CHECK-NEXT: store <4 x float> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], 2 +// CHECK-NEXT: store <4 x float> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD4_ELT6:%.*]] = 
extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], 3 +// CHECK-NEXT: store <4 x float> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x float>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <4 x float>] poison, <4 x float> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <4 x float>] [[TMP0]], <4 x float> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT9]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <4 x float>] [[TMP1]], <4 x float> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT11]], align 16 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <4 x float>] [[TMP2]], <4 x float> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT32X4X4_T]] poison, [4 x <4 x float>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X4_T]] [[TMP3]] +// float32x4x4_t test_vld4q_f32(float32_t const *a) { return vld4q_f32(a); } -// CHECK-LABEL: @test_vld4q_f64( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0(ptr %a) -// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float64x2x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x2x4_t @test_vld4q_f64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16 +// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 0 +// CHECK-NEXT: store <2 x double> [[VLD4_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 1 +// CHECK-NEXT: store <2 x double> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 2 +// CHECK-NEXT: store <2 x double> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16 
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 3 +// CHECK-NEXT: store <2 x double> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT9]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x double>] [[TMP1]], <2 x double> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT11]], align 16 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x double>] [[TMP2]], <2 x double> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] poison, [4 x <2 x double>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X4_T]] [[TMP3]] +// float64x2x4_t test_vld4q_f64(float64_t const *a) { return vld4q_f64(a); } -// CHECK-LABEL: @test_vld4q_p8( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr %a) -// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly8x16x4_t [[TMP5]] +// CHECK-LABEL: define dso_local %struct.poly8x16x4_t @test_vld4q_p8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16 +// CHECK-NEXT: [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 0 +// CHECK-NEXT: store <16 x i8> [[VLD4_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 1 +// CHECK-NEXT: store <16 x i8> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 2 +// CHECK-NEXT: store 
<16 x i8> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], 3 +// CHECK-NEXT: store <16 x i8> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <16 x i8>] [[TMP0]], <16 x i8> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT9]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <16 x i8>] [[TMP1]], <16 x i8> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT11]], align 16 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <16 x i8>] [[TMP2]], <16 x i8> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY8X16X4_T]] poison, [4 x <16 x i8>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY8X16X4_T]] [[TMP3]] +// poly8x16x4_t test_vld4q_p8(poly8_t const *a) { return vld4q_p8(a); } -// CHECK-LABEL: @test_vld4q_p16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr %a) -// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly16x8x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly16x8x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly16x8x4_t @test_vld4q_p16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16 +// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 0 +// CHECK-NEXT: store <8 x i16> [[VLD4_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 1 +// CHECK-NEXT: store <8 x i16> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 2 +// CHECK-NEXT: store <8 
x i16> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], 3 +// CHECK-NEXT: store <8 x i16> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <8 x i16>] [[TMP0]], <8 x i16> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT9]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <8 x i16>] [[TMP1]], <8 x i16> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT11]], align 16 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <8 x i16>] [[TMP2]], <8 x i16> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY16X8X4_T]] poison, [4 x <8 x i16>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY16X8X4_T]] [[TMP3]] +// poly16x8x4_t test_vld4q_p16(poly16_t const *a) { return vld4q_p16(a); } -// CHECK-LABEL: @test_vld4_u8( -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr %a) -// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint8x8x4_t [[TMP5]] +// CHECK-LABEL: define dso_local %struct.uint8x8x4_t @test_vld4_u8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8 +// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 0 +// CHECK-NEXT: store <8 x i8> [[VLD4_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 1 +// CHECK-NEXT: store <8 x i8> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 2 +// CHECK-NEXT: store <8 x i8> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 
8 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24 +// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 3 +// CHECK-NEXT: store <8 x i8> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT7]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT9]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <8 x i8>] [[TMP1]], <8 x i8> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT11]], align 8 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <8 x i8>] [[TMP2]], <8 x i8> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT8X8X4_T]] poison, [4 x <8 x i8>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT8X8X4_T]] [[TMP3]] +// uint8x8x4_t test_vld4_u8(uint8_t const *a) { return vld4_u8(a); } -// CHECK-LABEL: @test_vld4_u16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr %a) -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint16x4x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint16x4x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint16x4x4_t @test_vld4_u16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8 +// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 0 +// CHECK-NEXT: store <4 x i16> [[VLD4_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 1 +// CHECK-NEXT: store <4 x i16> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 2 +// CHECK-NEXT: store <4 x i16> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = 
getelementptr inbounds nuw i8, ptr [[__RET]], i64 24 +// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 3 +// CHECK-NEXT: store <4 x i16> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT7]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT9]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <4 x i16>] [[TMP1]], <4 x i16> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT11]], align 8 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <4 x i16>] [[TMP2]], <4 x i16> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT16X4X4_T]] poison, [4 x <4 x i16>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT16X4X4_T]] [[TMP3]] +// uint16x4x4_t test_vld4_u16(uint16_t const *a) { return vld4_u16(a); } -// CHECK-LABEL: @test_vld4_u32( -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr %a) -// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint32x2x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint32x2x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint32x2x4_t @test_vld4_u32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8 +// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 0 +// CHECK-NEXT: store <2 x i32> [[VLD4_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 1 +// CHECK-NEXT: store <2 x i32> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 2 +// CHECK-NEXT: store <2 x i32> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds 
nuw i8, ptr [[__RET]], i64 24 +// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 3 +// CHECK-NEXT: store <2 x i32> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT7]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i32>] [[TMP0]], <2 x i32> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT9]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i32>] [[TMP1]], <2 x i32> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT11]], align 8 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i32>] [[TMP2]], <2 x i32> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT32X2X4_T]] poison, [4 x <2 x i32>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT32X2X4_T]] [[TMP3]] +// uint32x2x4_t test_vld4_u32(uint32_t const *a) { return vld4_u32(a); } -// CHECK-LABEL: @test_vld4_u64( -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8 -// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr %a) -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint64x1x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint64x1x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint64x1x4_t @test_vld4_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X4_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8 +// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 0 +// CHECK-NEXT: store <1 x i64> [[VLD4_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 1 +// CHECK-NEXT: store <1 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 2 +// CHECK-NEXT: store <1 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], 
i64 24 +// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 3 +// CHECK-NEXT: store <1 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT9]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT11]], align 8 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <1 x i64>] [[TMP2]], <1 x i64> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT64X1X4_T]] poison, [4 x <1 x i64>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT64X1X4_T]] [[TMP3]] +// uint64x1x4_t test_vld4_u64(uint64_t const *a) { return vld4_u64(a); } -// CHECK-LABEL: @test_vld4_s8( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr %a) -// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP5:%.*]] = load %struct.int8x8x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int8x8x4_t [[TMP5]] +// CHECK-LABEL: define dso_local %struct.int8x8x4_t @test_vld4_s8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8 +// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 0 +// CHECK-NEXT: store <8 x i8> [[VLD4_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 1 +// CHECK-NEXT: store <8 x i8> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 2 +// CHECK-NEXT: store <8 x i8> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24 +// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <8 x i8>, 
<8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 3 +// CHECK-NEXT: store <8 x i8> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT7]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT9]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <8 x i8>] [[TMP1]], <8 x i8> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT11]], align 8 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <8 x i8>] [[TMP2]], <8 x i8> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT8X8X4_T]] poison, [4 x <8 x i8>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_INT8X8X4_T]] [[TMP3]] +// int8x8x4_t test_vld4_s8(int8_t const *a) { return vld4_s8(a); } -// CHECK-LABEL: @test_vld4_s16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8 -// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr %a) -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int16x4x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int16x4x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int16x4x4_t @test_vld4_s16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8 +// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 0 +// CHECK-NEXT: store <4 x i16> [[VLD4_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 1 +// CHECK-NEXT: store <4 x i16> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 2 +// CHECK-NEXT: store <4 x i16> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24 +// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 3 +// 
CHECK-NEXT: store <4 x i16> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT7]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT9]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <4 x i16>] [[TMP1]], <4 x i16> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT11]], align 8 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <4 x i16>] [[TMP2]], <4 x i16> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT16X4X4_T]] poison, [4 x <4 x i16>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_INT16X4X4_T]] [[TMP3]] +// int16x4x4_t test_vld4_s16(int16_t const *a) { return vld4_s16(a); } -// CHECK-LABEL: @test_vld4_s32( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8 -// CHECK: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr %a) -// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int32x2x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int32x2x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int32x2x4_t @test_vld4_s32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8 +// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 0 +// CHECK-NEXT: store <2 x i32> [[VLD4_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 1 +// CHECK-NEXT: store <2 x i32> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 2 +// CHECK-NEXT: store <2 x i32> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24 +// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], 3 +// CHECK-NEXT: store <2 x i32> 
[[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT7]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i32>] [[TMP0]], <2 x i32> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT9]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i32>] [[TMP1]], <2 x i32> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT11]], align 8 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i32>] [[TMP2]], <2 x i32> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT32X2X4_T]] poison, [4 x <2 x i32>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_INT32X2X4_T]] [[TMP3]] +// int32x2x4_t test_vld4_s32(int32_t const *a) { return vld4_s32(a); } -// CHECK-LABEL: @test_vld4_s64( -// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8 -// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr %a) -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int64x1x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int64x1x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int64x1x4_t @test_vld4_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X4_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8 +// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 0 +// CHECK-NEXT: store <1 x i64> [[VLD4_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 1 +// CHECK-NEXT: store <1 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 2 +// CHECK-NEXT: store <1 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24 +// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 3 +// CHECK-NEXT: store <1 x i64> [[VLD4_ELT6]], ptr 
[[__RET_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT9]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT11]], align 8 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <1 x i64>] [[TMP2]], <1 x i64> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT64X1X4_T]] poison, [4 x <1 x i64>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_INT64X1X4_T]] [[TMP3]] +// int64x1x4_t test_vld4_s64(int64_t const *a) { return vld4_s64(a); } -// CHECK-LABEL: @test_vld4_f16( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[VLD4:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0(ptr %a) -// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float16x4x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float16x4x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float16x4x4_t @test_vld4_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8 +// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], 0 +// CHECK-NEXT: store <4 x half> [[VLD4_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], 1 +// CHECK-NEXT: store <4 x half> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], 2 +// CHECK-NEXT: store <4 x half> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24 +// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4]], 3 +// CHECK-NEXT: store <4 x half> 
[[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x half>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <4 x half>] poison, <4 x half> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT7]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <4 x half>] [[TMP0]], <4 x half> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT9]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <4 x half>] [[TMP1]], <4 x half> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT11]], align 8 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <4 x half>] [[TMP2]], <4 x half> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT16X4X4_T]] poison, [4 x <4 x half>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X4_T]] [[TMP3]] +// float16x4x4_t test_vld4_f16(float16_t const *a) { return vld4_f16(a); } -// CHECK-LABEL: @test_vld4_f32( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0(ptr %a) -// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float32x2x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float32x2x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float32x2x4_t @test_vld4_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8 +// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], 0 +// CHECK-NEXT: store <2 x float> [[VLD4_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], 1 +// CHECK-NEXT: store <2 x float> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], 2 +// CHECK-NEXT: store <2 x float> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24 +// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x 
float>, <2 x float> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x float> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x float>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x float>] poison, <2 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x float>] [[TMP0]], <2 x float> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x float>] [[TMP1]], <2 x float> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x float>] [[TMP2]], <2 x float> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT32X2X4_T]] poison, [4 x <2 x float>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X4_T]] [[TMP3]]
+//
float32x2x4_t test_vld4_f32(float32_t const *a) { return vld4_f32(a); }

-// CHECK-LABEL: @test_vld4_f64(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0(ptr %a)
-// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x1x4_t @test_vld4_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 0
+// CHECK-NEXT: store <1 x double> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 1
+// CHECK-NEXT: store <1 x double> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 2
+// CHECK-NEXT: store <1 x double> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 3
+// CHECK-NEXT: store <1 x double> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <1 x double>] [[TMP1]], <1 x double> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <1 x double>] [[TMP2]], <1 x double> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] poison, [4 x <1 x double>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X4_T]] [[TMP3]]
+//
float64x1x4_t test_vld4_f64(float64_t const *a) { return vld4_f64(a); }

-// CHECK-LABEL: @test_vld4_p8(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x4_t [[TMP5]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x4_t @test_vld4_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <8 x i8>] [[TMP0]], <8 x i8> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <8 x i8>] [[TMP1]], <8 x i8> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <8 x i8>] [[TMP2]], <8 x i8> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY8X8X4_T]] poison, [4 x <8 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X4_T]] [[TMP3]]
+//
poly8x8x4_t test_vld4_p8(poly8_t const *a) { return vld4_p8(a); }

-// CHECK-LABEL: @test_vld4_p16(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly16x4x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x4_t @test_vld4_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <4 x i16>] [[TMP0]], <4 x i16> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <4 x i16>] [[TMP1]], <4 x i16> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <4 x i16>] [[TMP2]], <4 x i16> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY16X4X4_T]] poison, [4 x <4 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X4_T]] [[TMP3]]
+//
poly16x4x4_t test_vld4_p16(poly16_t const *a) { return vld4_p16(a); }

-// CHECK-LABEL: @test_vst1q_u8(
-// CHECK: store <16 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <16 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1q_u8(uint8_t *a, uint8x16_t b) { vst1q_u8(a, b); }

-// CHECK-LABEL: @test_vst1q_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: store <8 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x i16> [[B]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_u16(uint16_t *a, uint16x8_t b) { vst1q_u16(a, b); }

-// CHECK-LABEL: @test_vst1q_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: store <4 x i32> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <4 x i32> [[B]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1q_u32(uint32_t *a, uint32x4_t b) { vst1q_u32(a, b); }

-// CHECK-LABEL: @test_vst1q_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: store <2 x i64> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <2 x i64> [[B]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1q_u64(uint64_t *a, uint64x2_t b) { vst1q_u64(a, b); }

-// CHECK-LABEL: @test_vst1q_s8(
-// CHECK: store <16 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <16 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1q_s8(int8_t *a, int8x16_t b) { vst1q_s8(a, b); }

-// CHECK-LABEL: @test_vst1q_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: store <8 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x i16> [[B]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_s16(int16_t *a, int16x8_t b) { vst1q_s16(a, b); }

-// CHECK-LABEL: @test_vst1q_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: store <4 x i32> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <4 x i32> [[B]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1q_s32(int32_t *a, int32x4_t b) { vst1q_s32(a, b); }

-// CHECK-LABEL: @test_vst1q_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: store <2 x i64> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <2 x i64> [[B]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1q_s64(int64_t *a, int64x2_t b) { vst1q_s64(a, b); }

-// CHECK-LABEL: @test_vst1q_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: store <8 x half> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x half> [[B]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_f16(float16_t *a, float16x8_t b) { vst1q_f16(a, b); }

-// CHECK-LABEL: @test_vst1q_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: store <4 x float> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <4 x float> [[B]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1q_f32(float32_t *a, float32x4_t b) { vst1q_f32(a, b); }

-// CHECK-LABEL: @test_vst1q_f64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: store <2 x double> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <2 x double> [[B]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1q_f64(float64_t *a, float64x2_t b) { vst1q_f64(a, b); }

-// CHECK-LABEL: @test_vst1q_p8(
-// CHECK: store <16 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <16 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1q_p8(poly8_t *a, poly8x16_t b) { vst1q_p8(a, b); }

-// CHECK-LABEL: @test_vst1q_p16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: store <8 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x i16> [[B]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_p16(poly16_t *a, poly16x8_t b) { vst1q_p16(a, b); }

-// CHECK-LABEL: @test_vst1_u8(
-// CHECK: store <8 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1_u8(uint8_t *a, uint8x8_t b) { vst1_u8(a, b); }

-// CHECK-LABEL: @test_vst1_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: store <4 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <4 x i16> [[B]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1_u16(uint16_t *a, uint16x4_t b) { vst1_u16(a, b); }

-// CHECK-LABEL: @test_vst1_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: store <2 x i32> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <2 x i32> [[B]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1_u32(uint32_t *a, uint32x2_t b) { vst1_u32(a, b); }

-// CHECK-LABEL: @test_vst1_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: store <1 x i64> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <1 x i64> [[B]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1_u64(uint64_t *a, uint64x1_t b) { vst1_u64(a, b); }

-// CHECK-LABEL: @test_vst1_s8(
-// CHECK: store <8 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1_s8(int8_t *a, int8x8_t b) { vst1_s8(a, b); }

-// CHECK-LABEL: @test_vst1_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: store <4 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <4 x i16> [[B]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1_s16(int16_t *a, int16x4_t b) { vst1_s16(a, b); }

-// CHECK-LABEL: @test_vst1_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: store <2 x i32> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <2 x i32> [[B]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1_s32(int32_t *a, int32x2_t b) { vst1_s32(a, b); }

-// CHECK-LABEL: @test_vst1_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: store <1 x i64> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <1 x i64> [[B]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1_s64(int64_t *a, int64x1_t b) { vst1_s64(a, b); }

-// CHECK-LABEL: @test_vst1_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: store <4 x half> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <4 x half> [[B]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1_f16(float16_t *a, float16x4_t b) { vst1_f16(a, b); }

-// CHECK-LABEL: @test_vst1_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: store <2 x float> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <2 x float> [[B]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1_f32(float32_t *a, float32x2_t b) { vst1_f32(a, b); }

-// CHECK-LABEL: @test_vst1_f64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: store <1 x double> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <1 x double> [[B]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
void test_vst1_f64(float64_t *a, float64x1_t b) { vst1_f64(a, b); }

-// CHECK-LABEL: @test_vst1_p8(
-// CHECK: store <8 x i8> %b, ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <8 x i8> [[B]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1_p8(poly8_t *a, poly8x8_t b) { vst1_p8(a, b); }

-// CHECK-LABEL: @test_vst1_p16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: store <4 x i16> [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <4 x i16> [[B]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1_p16(poly16_t *a, poly16x4_t b) { vst1_p16(a, b); }

-// CHECK-LABEL: @test_vst2q_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_u8(uint8_t *a, uint8x16x2_t b) { vst2q_u8(a, b); }

-// CHECK-LABEL: @test_vst2q_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_u16(uint16_t *a, uint16x8x2_t b) { vst2q_u16(a, b); }

-// CHECK-LABEL: @test_vst2q_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_u32(uint32_t *a, uint32x4x2_t b) { vst2q_u32(a, b); }

-// CHECK-LABEL: @test_vst2q_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_u64(uint64_t *a, uint64x2x2_t b) { vst2q_u64(a, b); }

-// CHECK-LABEL: @test_vst2q_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_s8(int8_t *a, int8x16x2_t b) { vst2q_s8(a, b); }

-// CHECK-LABEL: @test_vst2q_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_s16(int16_t *a, int16x8x2_t b) { vst2q_s16(a, b); }

-// CHECK-LABEL: @test_vst2q_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_s32(int32_t *a, int32x4x2_t b) { vst2q_s32(a, b); }

-// CHECK-LABEL: @test_vst2q_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_s64(int64_t *a, int64x2x2_t b) { vst2q_s64(a, b); }

-// CHECK-LABEL: @test_vst2q_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> [[TMP7]], <8 x half> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_f16(float16_t *a, float16x8x2_t b) { vst2q_f16(a, b); }

-// CHECK-LABEL: @test_vst2q_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> [[TMP7]], <4 x float> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_f32(float32_t *a, float32x4x2_t b) { vst2q_f32(a, b); }

-// CHECK-LABEL: @test_vst2q_f64(
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK: call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> [[TMP7]], <2 x double> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_f64(float64_t *a, float64x2x2_t b) { vst2q_f64(a, b); }

-// CHECK-LABEL: @test_vst2q_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_p8(poly8_t *a, poly8x16x2_t b) { vst2q_p8(a, b); }

-// CHECK-LABEL: @test_vst2q_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2q_p16(poly16_t *a, poly16x8x2_t b) { vst2q_p16(a, b); }

-// CHECK-LABEL: @test_vst2_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_u8(uint8_t *a, uint8x8x2_t b) { vst2_u8(a, b); }

-// CHECK-LABEL: @test_vst2_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_u16(uint16_t *a, uint16x4x2_t b) { vst2_u16(a, b); }

-// CHECK-LABEL: @test_vst2_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_u32(uint32_t *a, uint32x2x2_t b) { vst2_u32(a, b); }

-// CHECK-LABEL: @test_vst2_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_u64(uint64_t *a, uint64x1x2_t b) { vst2_u64(a, b); }

-// CHECK-LABEL: @test_vst2_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] =
alloca %struct.int8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_s8(int8_t *a, int8x8x2_t b) { vst2_s8(a, b); } -// CHECK-LABEL: @test_vst2_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: call void 
@llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_s16(int16_t *a, int16x4x2_t b) { vst2_s16(a, b); } -// CHECK-LABEL: @test_vst2_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void 
@llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_s32(int32_t *a, int32x2x2_t b) { vst2_s32(a, b); } -// CHECK-LABEL: @test_vst2_s64( -// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_s64(int64_t *a, int64x1x2_t b) { vst2_s64(a, b); } -// CHECK-LABEL: @test_vst2_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: 
store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> [[TMP7]], <4 x half> [[TMP8]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_f16(float16_t *a, float16x4x2_t b) { vst2_f16(a, b); } -// CHECK-LABEL: @test_vst2_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x 
float> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> -// CHECK: call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> [[TMP7]], <2 x float> [[TMP8]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_f32(float32_t *a, float32x2x2_t b) { vst2_f32(a, b); } -// CHECK-LABEL: @test_vst2_f64( -// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> -// CHECK: call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> [[TMP7]], <1 x double> [[TMP8]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: 
[[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_f64(float64_t *a, float64x1x2_t b) { vst2_f64(a, b); } -// CHECK-LABEL: @test_vst2_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_p8(poly8_t *a, poly8x8x2_t b) { vst2_p8(a, b); } -// CHECK-LABEL: @test_vst2_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store 
[2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_p16(poly16_t *a, poly16x4x2_t b) { vst2_p16(a, b); } -// CHECK-LABEL: @test_vst3q_u8( -// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr 
inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 -// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3q_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3q_u8(uint8_t *a, uint8x16x3_t b) { vst3q_u8(a, b); } -// CHECK-LABEL: @test_vst3q_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] 
to <8 x i16> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> -// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3q_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3q_u16(uint16_t *a, uint16x8x3_t b) { vst3q_u16(a, b); } -// CHECK-LABEL: @test_vst3q_u32( -// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] 
= bitcast <16 x i8> [[TMP4]] to <4 x i32> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> -// CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3q_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3q_u32(uint32_t *a, uint32x4x3_t b) { vst3q_u32(a, b); } -// CHECK-LABEL: @test_vst3q_u64( -// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 
x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3q_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3q_u64(uint64_t *a, uint64x2x3_t b) { vst3q_u64(a, b); } -// CHECK-LABEL: @test_vst3q_s8( -// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 -// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define 
dso_local void @test_vst3q_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3q_s8(int8_t *a, int8x16x3_t b) { vst3q_s8(a, b); } -// CHECK-LABEL: @test_vst3q_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> -// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], ptr %a) -// CHECK: ret void +// 
CHECK-LABEL: define dso_local void @test_vst3q_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3q_s16(int16_t *a, int16x8x3_t b) { vst3q_s16(a, b); } -// CHECK-LABEL: @test_vst3q_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> -// CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], ptr %a) -// 
CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_s32(int32_t *a, int32x4x3_t b) {
   vst3q_s32(a, b);
 }
-// CHECK-LABEL: @test_vst3q_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_s64(int64_t *a, int64x2x3_t b) {
   vst3q_s64(a, b);
 }
-// CHECK-LABEL: @test_vst3q_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_f16(float16_t *a, float16x8x3_t b) {
   vst3q_f16(a, b);
 }
-// CHECK-LABEL: @test_vst3q_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_f32(float32_t *a, float32x4x3_t b) {
   vst3q_f32(a, b);
 }
-// CHECK-LABEL: @test_vst3q_f64(
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
-// CHECK: call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_f64(float64_t *a, float64x2x3_t b) {
   vst3q_f64(a, b);
 }
-// CHECK-LABEL: @test_vst3q_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_p8(poly8_t *a, poly8x16x3_t b) {
   vst3q_p8(a, b);
 }
-// CHECK-LABEL: @test_vst3q_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3q_p16(poly16_t *a, poly16x8x3_t b) {
   vst3q_p16(a, b);
 }
-// CHECK-LABEL: @test_vst3_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_u8(uint8_t *a, uint8x8x3_t b) {
   vst3_u8(a, b);
 }
-// CHECK-LABEL: @test_vst3_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_u16(uint16_t *a, uint16x4x3_t b) {
   vst3_u16(a, b);
 }
-// CHECK-LABEL: @test_vst3_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_u32(uint32_t *a, uint32x2x3_t b) {
   vst3_u32(a, b);
 }
-// CHECK-LABEL: @test_vst3_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_u64(uint64_t *a, uint64x1x3_t b) {
   vst3_u64(a, b);
 }
-// CHECK-LABEL: @test_vst3_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_s8(int8_t *a, int8x8x3_t b) {
   vst3_s8(a, b);
 }
-// CHECK-LABEL: @test_vst3_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_s16(int16_t *a, int16x4x3_t b) {
   vst3_s16(a, b);
 }
-// CHECK-LABEL: @test_vst3_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_s32(int32_t *a, int32x2x3_t b) {
   vst3_s32(a, b);
 }
-// CHECK-LABEL: @test_vst3_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_s64(int64_t *a, int64x1x3_t b) {
   vst3_s64(a, b);
 }
-// CHECK-LABEL: @test_vst3_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_f16(float16_t *a, float16x4x3_t b) {
   vst3_f16(a, b);
 }
-// CHECK-LABEL: @test_vst3_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK: call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
++//
 void test_vst3_f32(float32_t *a, float32x2x3_t b) {
   vst3_f32(a, b);
 }
-// CHECK-LABEL: @test_vst3_f64(
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
-// CHECK: call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], <1 x double> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_f64(float64_t *a, float64x1x3_t b) {
   vst3_f64(a, b);
 }
-// CHECK-LABEL: @test_vst3_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_p8(poly8_t *a, poly8x8x3_t b) {
   vst3_p8(a, b);
 }
-// CHECK-LABEL: @test_vst3_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst3_p16(poly16_t *a, poly16x4x3_t b) {
   vst3_p16(a, b);
 }
-// CHECK-LABEL: @test_vst4q_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]],
3 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_u8(uint8_t *a, uint8x16x4_t b) { vst4q_u8(a, b); } -// CHECK-LABEL: @test_vst4q_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16> -// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16 +// 
CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_u16(uint16_t *a, uint16x8x4_t b) { vst4q_u16(a, b); } -// CHECK-LABEL: @test_vst4q_u32( -// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = 
getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> -// CHECK: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_u32(uint32_t *a, uint32x4x4_t b) { vst4q_u32(a, b); } -// CHECK-LABEL: @test_vst4q_u64( -// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr 
[[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void 
@llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_u64(uint64_t *a, uint64x2x4_t b) { vst4q_u64(a, b); } -// CHECK-LABEL: @test_vst4q_s8( -// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16 -// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 +// 
CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_s8(int8_t *a, int8x16x4_t b) { vst4q_s8(a, b); } -// CHECK-LABEL: @test_vst4q_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16> -// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = 
getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_s16(int16_t *a, int16x8x4_t b) { vst4q_s16(a, b); } -// CHECK-LABEL: @test_vst4q_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> -// CHECK: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 
x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_s32(int32_t *a, int32x4x4_t b) { vst4q_s32(a, b); } -// CHECK-LABEL: @test_vst4q_s64( -// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: 
[[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_s64(int64_t *a, int64x2x4_t b) { vst4q_s64(a, b); } -// CHECK-LABEL: @test_vst4q_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 
16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half> -// CHECK: call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 3 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr 
[[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], <8 x half> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_f16(float16_t *a, float16x8x4_t b) { vst4q_f16(a, b); } -// CHECK-LABEL: @test_vst4q_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float> -// CHECK: call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 1 +// 
CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 3 +// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_f32(float32_t *a, float32x4x4_t b) { vst4q_f32(a, b); } -// CHECK-LABEL: @test_vst4q_f64( -// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 
x double> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double> -// CHECK: call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 3 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_f64(float64_t *a, float64x2x4_t b) { vst4q_f64(a, b); } -// CHECK-LABEL: @test_vst4q_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <16 x 
i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16 -// CHECK: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_p8(poly8_t *a, poly8x16x4_t b) { vst4q_p8(a, b); } -// CHECK-LABEL: @test_vst4q_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds 
[4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_p16(poly16_t *a, poly16x8x4_t b) {
   vst4q_p16(a, b);
 }

-// CHECK-LABEL: @test_vst4_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_u8(uint8_t *a, uint8x8x4_t b) {
   vst4_u8(a, b);
 }

-// CHECK-LABEL: @test_vst4_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_u16(uint16_t *a, uint16x4x4_t b) {
   vst4_u16(a, b);
 }

-// CHECK-LABEL: @test_vst4_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_u32(uint32_t *a, uint32x2x4_t b) {
   vst4_u32(a, b);
 }

-// CHECK-LABEL: @test_vst4_u64(
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_u64(uint64_t *a, uint64x1x4_t b) {
   vst4_u64(a, b);
 }

-// CHECK-LABEL: @test_vst4_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_s8(int8_t *a, int8x8x4_t b) {
   vst4_s8(a, b);
 }

-// CHECK-LABEL: @test_vst4_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_s16(int16_t *a, int16x4x4_t b) {
   vst4_s16(a, b);
 }

-// CHECK-LABEL: @test_vst4_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_s32(int32_t *a, int32x2x4_t b) {
   vst4_s32(a, b);
 }

-// CHECK-LABEL: @test_vst4_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_s64(int64_t *a, int64x1x4_t b) {
   vst4_s64(a, b);
 }

-// CHECK-LABEL: @test_vst4_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
-// CHECK: call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], <4 x half> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_f16(float16_t *a, float16x4x4_t b) {
   vst4_f16(a, b);
 }

-// CHECK-LABEL: @test_vst4_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
-// CHECK: call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_f32(float32_t *a, float32x2x4_t b) {
   vst4_f32(a, b);
 }

-// CHECK-LABEL: @test_vst4_f64(
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
-// CHECK: call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], <1 x double> [[TMP2]], <1 x double> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_f64(float64_t *a, float64x1x4_t b) {
   vst4_f64(a, b);
 }

-// CHECK-LABEL: @test_vst4_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_p8(poly8_t *a, poly8x8x4_t b) {
   vst4_p8(a, b);
 }

-// CHECK-LABEL: @test_vst4_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_p16(poly16_t *a, poly16x4x4_t b) {
   vst4_p16(a, b);
 }

-// CHECK-LABEL: @test_vld1q_f64_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0(ptr %a)
-// CHECK: store { <2 x double>, <2 x double> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x2x2_t @test_vld1q_f64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD1XN]], 0
+// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD1XN]], 1
+// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT64X2X2_T]] poison, [2 x <2 x double>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X2_T]] [[TMP1]]
+//
 float64x2x2_t test_vld1q_f64_x2(float64_t const *a) {
   return vld1q_f64_x2(a);
 }

-// CHECK-LABEL: @test_vld1q_p64_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x2_t @test_vld1q_p64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD1XN]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD1XN]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X2_T]] [[TMP1]]
+//
 poly64x2x2_t test_vld1q_p64_x2(poly64_t const *a) {
   return vld1q_p64_x2(a);
 }

-// CHECK-LABEL: @test_vld1_f64_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0(ptr %a)
-// CHECK: store { <1 x double>, <1 x double> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x1x2_t @test_vld1_f64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD1XN]], 0
+// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD1XN]], 1
+// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT64X1X2_T]] poison, [2 x <1 x double>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X2_T]] [[TMP1]]
+//
 float64x1x2_t test_vld1_f64_x2(float64_t const *a) {
   return vld1_f64_x2(a);
 }

-// CHECK-LABEL: @test_vld1_p64_x2(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x2_t @test_vld1_p64_x2(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD1XN]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD1XN]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY64X1X2_T]] poison, [2 x <1 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X2_T]] [[TMP1]]
+//
 poly64x1x2_t test_vld1_p64_x2(poly64_t const *a) {
   return vld1_p64_x2(a);
 }

-// CHECK-LABEL: @test_vld1q_f64_x3(
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0(ptr %a)
-// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x2x3_t @test_vld1q_f64_x3(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 0
+// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 1
+// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD1XN_ELT4:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 2
+// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull
align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x double>] [[TMP1]], <2 x double> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T]] poison, [3 x <2 x double>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X3_T]] [[TMP2]] +// float64x2x3_t test_vld1q_f64_x3(float64_t const *a) { return vld1q_f64_x3(a); } -// CHECK-LABEL: @test_vld1q_p64_x3( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16 -// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly64x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x2x3_t @test_vld1q_p64_x3( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16 +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 1 +// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD1XN_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 2 +// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: 
[[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY64X2X3_T]] [[TMP2]] +// poly64x2x3_t test_vld1q_p64_x3(poly64_t const *a) { return vld1q_p64_x3(a); } -// CHECK-LABEL: @test_vld1_f64_x3( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0(ptr %a) -// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float64x1x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x1x3_t @test_vld1_f64_x3( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8 +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 0 +// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 1 +// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD1XN_ELT4:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 2 +// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT4]], ptr [[__RET_REPACK3]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT5]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT7]], align 8 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <1 x double>] [[TMP1]], <1 x double> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T]] poison, [3 x <1 x double>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X3_T]] [[TMP2]] +// float64x1x3_t test_vld1_f64_x3(float64_t const *a) { return vld1_f64_x3(a); 
} -// CHECK-LABEL: @test_vld1_p64_x3( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0(ptr %a) -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly64x1x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x1x3_t @test_vld1_p64_x3( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8 +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 0 +// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 1 +// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD1XN_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 2 +// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT4]], ptr [[__RET_REPACK3]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT5]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T]] poison, [3 x <1 x i64>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY64X1X3_T]] [[TMP2]] +// poly64x1x3_t test_vld1_p64_x3(poly64_t const *a) { return vld1_p64_x3(a); } -// CHECK-LABEL: @test_vld1q_f64_x4( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0(ptr %a) -// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret 
%struct.float64x2x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x2x4_t @test_vld1q_f64_x4( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16 +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 0 +// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 1 +// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD1XN_ELT4:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 2 +// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD1XN_ELT6:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], 3 +// CHECK-NEXT: store <2 x double> [[VLD1XN_ELT6]], ptr [[__RET_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT9]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x double>] [[TMP1]], <2 x double> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT11]], align 16 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x double>] [[TMP2]], <2 x double> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] poison, [4 x <2 x double>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X4_T]] [[TMP3]] +// float64x2x4_t test_vld1q_f64_x4(float64_t const *a) { return vld1q_f64_x4(a); } -// CHECK-LABEL: @test_vld1q_p64_x4( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void 
@llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly64x2x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x2x4_t @test_vld1q_p64_x4( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16 +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 1 +// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD1XN_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 2 +// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD1XN_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], 3 +// CHECK-NEXT: store <2 x i64> [[VLD1XN_ELT6]], ptr [[__RET_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY64X2X4_T]] [[TMP3]] +// poly64x2x4_t test_vld1q_p64_x4(poly64_t const *a) { return vld1q_p64_x4(a); } -// CHECK-LABEL: @test_vld1_f64_x4( -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8 -// CHECK: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0(ptr %a) -// CHECK: store { <1 x double>, <1 x double>, <1 x 
double>, <1 x double> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float64x1x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x1x4_t @test_vld1_f64_x4( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8 +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 0 +// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 1 +// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD1XN_ELT4:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 2 +// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT4]], ptr [[__RET_REPACK3]], align 8 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24 +// CHECK-NEXT: [[VLD1XN_ELT6:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], 3 +// CHECK-NEXT: store <1 x double> [[VLD1XN_ELT6]], ptr [[__RET_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT7]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT9]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <1 x double>] [[TMP1]], <1 x double> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT11]], align 8 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <1 x double>] [[TMP2]], <1 x double> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] poison, [4 x <1 x double>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X4_T]] [[TMP3]] +// float64x1x4_t test_vld1_f64_x4(float64_t const *a) { return vld1_f64_x4(a); } -// CHECK-LABEL: @test_vld1_p64_x4( -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8 -// CHECK: 
[[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0(ptr %a) -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly64x1x4_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x1x4_t @test_vld1_p64_x4( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8 +// CHECK-NEXT: [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD1XN_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 0 +// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD1XN_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 1 +// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD1XN_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 2 +// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT4]], ptr [[__RET_REPACK3]], align 8 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24 +// CHECK-NEXT: [[VLD1XN_ELT6:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], 3 +// CHECK-NEXT: store <1 x i64> [[VLD1XN_ELT6]], ptr [[__RET_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK8]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT9]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK10]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT11]], align 8 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <1 x i64>] [[TMP2]], <1 x i64> [[DOTUNPACK_UNPACK12]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] poison, [4 x <1 x i64>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY64X1X4_T]] [[TMP3]] +// poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) { return vld1_p64_x4(a); } -// CHECK-LABEL: @test_vst1q_f64_x2( -// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca 
%struct.float64x2x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> -// CHECK: call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x double> [[TMP7]], <2 x double> [[TMP8]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_f64_x2( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1q_f64_x2(float64_t *a, float64x2x2_t b) { vst1q_f64_x2(a, b); } -// CHECK-LABEL: @test_vst1q_p64_x2( -// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 
0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_p64_x2( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1q_p64_x2(poly64_t *a, poly64x2x2_t b) { vst1q_p64_x2(a, b); } -// CHECK-LABEL: @test_vst1_f64_x2( -// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> -// CHECK: call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x double> [[TMP7]], <1 x double> [[TMP8]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_f64_x2( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8 +// CHECK-NEXT: 
[[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1_f64_x2(float64_t *a, float64x1x2_t b) { vst1_f64_x2(a, b); } -// CHECK-LABEL: @test_vst1_p64_x2( -// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_p64_x2( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: 
[[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x2.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1_p64_x2(poly64_t *a, poly64x1x2_t b) { vst1_p64_x2(a, b); } -// CHECK-LABEL: @test_vst1q_f64_x3( -// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double> -// CHECK: call void @llvm.aarch64.neon.st1x3.v2f64.p0(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_f64_x3( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 
16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1q_f64_x3(float64_t *a, float64x2x3_t b) { vst1q_f64_x3(a, b); } -// CHECK-LABEL: @test_vst1q_p64_x3( -// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_p64_x3( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// 
CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1q_p64_x3(poly64_t *a, poly64x2x3_t b) { vst1q_p64_x3(a, b); } -// CHECK-LABEL: @test_vst1_f64_x3( -// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double> -// CHECK: call void @llvm.aarch64.neon.st1x3.v1f64.p0(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_f64_x3( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 2 +// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], 
align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], <1 x double> [[TMP2]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1_f64_x3(float64_t *a, float64x1x3_t b) { vst1_f64_x3(a, b); } -// CHECK-LABEL: @test_vst1_p64_x3( -// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_p64_x3( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] 
= load <1 x i64>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x3.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1_p64_x3(poly64_t *a, poly64x1x3_t b) { vst1_p64_x3(a, b); } -// CHECK-LABEL: @test_vst1q_f64_x4( -// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double> -// CHECK: call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_f64_x4( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x double>] 
[[B_COERCE]], 1 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 3 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1q_f64_x4(float64_t *a, float64x2x4_t b) { vst1q_f64_x4(a, b); } -// CHECK-LABEL: @test_vst1q_p64_x4( -// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x 
i64> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_p64_x4( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1q_p64_x4(poly64_t *a, poly64x2x4_t b) { vst1q_p64_x4(a, b); } -// CHECK-LABEL: @test_vst1_f64_x4( -// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: 
[[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double> -// CHECK: call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_f64_x4( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 2 +// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 3 +// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24 +// CHECK-NEXT: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], <1 x double> [[TMP2]], <1 x double> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1_f64_x4(float64_t *a, float64x1x4_t b) { vst1_f64_x4(a, b); } -// 
CHECK-LABEL: @test_vst1_p64_x4( -// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_p64_x4( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef 
nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24 +// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st1x4.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst1_p64_x4(poly64_t *a, poly64x1x4_t b) { vst1_p64_x4(a, b); } -// CHECK-LABEL: @test_vceqd_s64( -// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vceqd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vceqd_s64(int64_t a, int64_t b) { return (uint64_t)vceqd_s64(a, b); } -// CHECK-LABEL: @test_vceqd_u64( -// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vceqd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vceqd_u64(uint64_t a, uint64_t b) { return (int64_t)vceqd_u64(a, b); } -// CHECK-LABEL: @test_vceqzd_s64( -// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, 0 -// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vceqzd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[A]], 0 +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQZ_I]] +// uint64_t test_vceqzd_s64(int64_t a) { return (uint64_t)vceqzd_s64(a); } -// CHECK-LABEL: @test_vceqzd_u64( -// CHECK: [[TMP0:%.*]] = icmp eq i64 %a, 0 -// CHECK: [[VCEQZD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQZD_I]] +// CHECK-LABEL: define dso_local i64 @test_vceqzd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[A]], 0 +// CHECK-NEXT: [[VCEQZD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQZD_I]] +// int64_t test_vceqzd_u64(int64_t a) { return (int64_t)vceqzd_u64(a); } -// CHECK-LABEL: @test_vcged_s64( -// CHECK: [[TMP0:%.*]] = icmp sge i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcged_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sge i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vcged_s64(int64_t a, 
int64_t b) { return (uint64_t)vcged_s64(a, b); } -// CHECK-LABEL: @test_vcged_u64( -// CHECK: [[TMP0:%.*]] = icmp uge i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcged_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp uge i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vcged_u64(uint64_t a, uint64_t b) { return (uint64_t)vcged_u64(a, b); } -// CHECK-LABEL: @test_vcgezd_s64( -// CHECK: [[TMP0:%.*]] = icmp sge i64 %a, 0 -// CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCGEZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vcgezd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[A]], -1 +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCGEZ_I]] +// uint64_t test_vcgezd_s64(int64_t a) { return (uint64_t)vcgezd_s64(a); } -// CHECK-LABEL: @test_vcgtd_s64( -// CHECK: [[TMP0:%.*]] = icmp sgt i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcgtd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vcgtd_s64(int64_t a, int64_t b) { return (uint64_t)vcgtd_s64(a, b); } -// CHECK-LABEL: @test_vcgtd_u64( -// CHECK: [[TMP0:%.*]] = icmp ugt i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcgtd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vcgtd_u64(uint64_t a, uint64_t b) { return (uint64_t)vcgtd_u64(a, b); } -// CHECK-LABEL: @test_vcgtzd_s64( -// CHECK: [[TMP0:%.*]] = icmp sgt i64 %a, 0 -// CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCGTZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vcgtzd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[A]], 0 +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCGTZ_I]] +// uint64_t test_vcgtzd_s64(int64_t a) { return (uint64_t)vcgtzd_s64(a); } -// CHECK-LABEL: @test_vcled_s64( -// CHECK: [[TMP0:%.*]] = icmp sle i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcled_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sle i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vcled_s64(int64_t a, int64_t b) { return (uint64_t)vcled_s64(a, b); } -// CHECK-LABEL: @test_vcled_u64( -// CHECK: [[TMP0:%.*]] = icmp ule i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local 
i64 @test_vcled_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp ule i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vcled_u64(uint64_t a, uint64_t b) { return (uint64_t)vcled_u64(a, b); } -// CHECK-LABEL: @test_vclezd_s64( -// CHECK: [[TMP0:%.*]] = icmp sle i64 %a, 0 -// CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCLEZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vclezd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp slt i64 [[A]], 1 +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCLEZ_I]] +// uint64_t test_vclezd_s64(int64_t a) { return (uint64_t)vclezd_s64(a); } -// CHECK-LABEL: @test_vcltd_s64( -// CHECK: [[TMP0:%.*]] = icmp slt i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcltd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp slt i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vcltd_s64(int64_t a, int64_t b) { return (uint64_t)vcltd_s64(a, b); } -// CHECK-LABEL: @test_vcltd_u64( -// CHECK: [[TMP0:%.*]] = icmp ult i64 %a, %b -// CHECK: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcltd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp ult i64 [[A]], [[B]] +// CHECK-NEXT: [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQD_I]] +// uint64_t test_vcltd_u64(uint64_t a, uint64_t b) { return (uint64_t)vcltd_u64(a, b); } -// CHECK-LABEL: @test_vcltzd_s64( -// CHECK: [[TMP0:%.*]] = icmp slt i64 %a, 0 -// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCLTZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vcltzd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr i64 [[A]], 63 +// CHECK-NEXT: ret i64 [[A_LOBIT]] +// uint64_t test_vcltzd_s64(int64_t a) { return (uint64_t)vcltzd_s64(a); } -// CHECK-LABEL: @test_vtstd_s64( -// CHECK: [[TMP0:%.*]] = and i64 %a, %b -// CHECK: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0 -// CHECK: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64 -// CHECK: ret i64 [[VTSTD_I]] +// CHECK-LABEL: define dso_local i64 @test_vtstd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and i64 [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0 +// CHECK-NEXT: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64 +// CHECK-NEXT: ret i64 [[VTSTD_I]] +// uint64_t test_vtstd_s64(int64_t a, int64_t b) { return (uint64_t)vtstd_s64(a, b); } -// CHECK-LABEL: @test_vtstd_u64( -// CHECK: [[TMP0:%.*]] = and i64 %a, %b -// CHECK: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0 -// CHECK: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64 -// CHECK: ret i64 [[VTSTD_I]] +// CHECK-LABEL: define dso_local i64 @test_vtstd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and i64 
[[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0 +// CHECK-NEXT: [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64 +// CHECK-NEXT: ret i64 [[VTSTD_I]] +// uint64_t test_vtstd_u64(uint64_t a, uint64_t b) { return (uint64_t)vtstd_u64(a, b); } -// CHECK-LABEL: @test_vabsd_s64( -// CHECK: [[VABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.abs.i64(i64 %a) -// CHECK: ret i64 [[VABSD_S64_I]] +// CHECK-LABEL: define dso_local i64 @test_vabsd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.abs.i64(i64 [[A]]) +// CHECK-NEXT: ret i64 [[VABSD_S64_I]] +// int64_t test_vabsd_s64(int64_t a) { return (int64_t)vabsd_s64(a); } -// CHECK-LABEL: @test_vqabsb_s8( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0 -// CHECK: [[VQABSB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQABSB_S8_I]], i64 0 -// CHECK: ret i8 [[TMP1]] +// CHECK-LABEL: define dso_local i8 @test_vqabsb_s8( +// CHECK-SAME: i8 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[VQABSB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQABSB_S8_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP1]] +// int8_t test_vqabsb_s8(int8_t a) { return (int8_t)vqabsb_s8(a); } -// CHECK-LABEL: @test_vqabsh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[VQABSH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQABSH_S16_I]], i64 0 -// CHECK: ret i16 [[TMP1]] +// CHECK-LABEL: define dso_local i16 @test_vqabsh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[VQABSH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQABSH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP1]] +// int16_t test_vqabsh_s16(int16_t a) { return (int16_t)vqabsh_s16(a); } -// CHECK-LABEL: @test_vqabss_s32( -// CHECK: [[VQABSS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a) -// CHECK: ret i32 [[VQABSS_S32_I]] +// CHECK-LABEL: define dso_local i32 @test_vqabss_s32( +// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABSS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqabs.i32(i32 [[A]]) +// CHECK-NEXT: ret i32 [[VQABSS_S32_I]] +// int32_t test_vqabss_s32(int32_t a) { return (int32_t)vqabss_s32(a); } -// CHECK-LABEL: @test_vqabsd_s64( -// CHECK: [[VQABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqabs.i64(i64 %a) -// CHECK: ret i64 [[VQABSD_S64_I]] +// CHECK-LABEL: define dso_local i64 @test_vqabsd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqabs.i64(i64 [[A]]) +// CHECK-NEXT: ret i64 [[VQABSD_S64_I]] +// int64_t test_vqabsd_s64(int64_t a) { return (int64_t)vqabsd_s64(a); } -// CHECK-LABEL: @test_vnegd_s64( -// CHECK: [[VNEGD_I:%.*]] = sub i64 0, %a -// CHECK: ret i64 [[VNEGD_I]] +// CHECK-LABEL: define dso_local i64 @test_vnegd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VNEGD_I:%.*]] = sub i64 0, [[A]] +// CHECK-NEXT: ret i64 [[VNEGD_I]] +// int64_t test_vnegd_s64(int64_t a) { return (int64_t)vnegd_s64(a); } -// CHECK-LABEL: @test_vqnegb_s8( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0 -// CHECK: [[VQNEGB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQNEGB_S8_I]], i64 0 -// CHECK: ret i8 [[TMP1]] +// CHECK-LABEL: define dso_local i8 @test_vqnegb_s8( +// CHECK-SAME: i8 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[VQNEGB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQNEGB_S8_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP1]] +// int8_t test_vqnegb_s8(int8_t a) { return (int8_t)vqnegb_s8(a); } -// CHECK-LABEL: @test_vqnegh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[VQNEGH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQNEGH_S16_I]], i64 0 -// CHECK: ret i16 [[TMP1]] +// CHECK-LABEL: define dso_local i16 @test_vqnegh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[VQNEGH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQNEGH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP1]] +// int16_t test_vqnegh_s16(int16_t a) { return (int16_t)vqnegh_s16(a); } -// CHECK-LABEL: @test_vqnegs_s32( -// CHECK: [[VQNEGS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqneg.i32(i32 %a) -// CHECK: ret i32 [[VQNEGS_S32_I]] +// CHECK-LABEL: define dso_local i32 @test_vqnegs_s32( +// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQNEGS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqneg.i32(i32 [[A]]) +// CHECK-NEXT: ret i32 [[VQNEGS_S32_I]] +// int32_t test_vqnegs_s32(int32_t a) { return (int32_t)vqnegs_s32(a); } -// CHECK-LABEL: @test_vqnegd_s64( -// CHECK: [[VQNEGD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqneg.i64(i64 %a) -// CHECK: ret i64 [[VQNEGD_S64_I]] +// CHECK-LABEL: define dso_local i64 @test_vqnegd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQNEGD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqneg.i64(i64 [[A]]) +// CHECK-NEXT: ret i64 [[VQNEGD_S64_I]] +// int64_t test_vqnegd_s64(int64_t a) { return (int64_t)vqnegd_s64(a); } -// CHECK-LABEL: @test_vuqaddb_s8( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0 -// CHECK: [[VUQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VUQADDB_S8_I]], i64 0 -// CHECK: ret i8 [[TMP2]] +// CHECK-LABEL: define dso_local i8 @test_vuqaddb_s8( +// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +// CHECK-NEXT: [[VUQADDB_S8_I:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VUQADDB_S8_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP2]] +// int8_t test_vuqaddb_s8(int8_t a, uint8_t b) { return (int8_t)vuqaddb_s8(a, b); } -// CHECK-LABEL: @test_vuqaddh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[VUQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VUQADDH_S16_I]], i64 0 -// CHECK: ret i16 [[TMP2]] +// CHECK-LABEL: define dso_local i16 @test_vuqaddh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[VUQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VUQADDH_S16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP2]] +// int16_t test_vuqaddh_s16(int16_t a, uint16_t b) { return (int16_t)vuqaddh_s16(a, b); } -// CHECK-LABEL: @test_vuqadds_s32( -// CHECK: [[VUQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.suqadd.i32(i32 %a, i32 %b) -// CHECK: ret i32 [[VUQADDS_S32_I]] +// CHECK-LABEL: define dso_local i32 @test_vuqadds_s32( +// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.suqadd.i32(i32 [[A]], i32 [[B]]) +// CHECK-NEXT: ret i32 [[VUQADDS_S32_I]] +// int32_t test_vuqadds_s32(int32_t a, uint32_t b) { return (int32_t)vuqadds_s32(a, b); } -// CHECK-LABEL: @test_vuqaddd_s64( -// CHECK: [[VUQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.suqadd.i64(i64 %a, i64 %b) -// CHECK: ret i64 [[VUQADDD_S64_I]] +// CHECK-LABEL: define dso_local i64 @test_vuqaddd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.suqadd.i64(i64 [[A]], i64 [[B]]) +// CHECK-NEXT: ret i64 [[VUQADDD_S64_I]] +// int64_t test_vuqaddd_s64(int64_t a, uint64_t b) { return (int64_t)vuqaddd_s64(a, b); } -// CHECK-LABEL: @test_vsqaddb_u8( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 %b, i64 0 -// CHECK: [[VSQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <8 x i8> [[VSQADDB_U8_I]], i64 0 -// CHECK: ret i8 [[TMP2]] +// CHECK-LABEL: define dso_local i8 @test_vsqaddb_u8( +// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +// CHECK-NEXT: [[VSQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[VSQADDB_U8_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP2]] +// uint8_t test_vsqaddb_u8(uint8_t a, int8_t b) { return (uint8_t)vsqaddb_u8(a, b); } -// CHECK-LABEL: @test_vsqaddh_u16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x 
i16> poison, i16 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[VSQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <4 x i16> [[VSQADDH_U16_I]], i64 0 -// CHECK: ret i16 [[TMP2]] +// CHECK-LABEL: define dso_local i16 @test_vsqaddh_u16( +// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[VSQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VSQADDH_U16_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP2]] +// uint16_t test_vsqaddh_u16(uint16_t a, int16_t b) { return (uint16_t)vsqaddh_u16(a, b); } -// CHECK-LABEL: @test_vsqadds_u32( -// CHECK: [[VSQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.usqadd.i32(i32 %a, i32 %b) -// CHECK: ret i32 [[VSQADDS_U32_I]] +// CHECK-LABEL: define dso_local i32 @test_vsqadds_u32( +// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.usqadd.i32(i32 [[A]], i32 [[B]]) +// CHECK-NEXT: ret i32 [[VSQADDS_U32_I]] +// uint32_t test_vsqadds_u32(uint32_t a, int32_t b) { return (uint32_t)vsqadds_u32(a, b); } -// CHECK-LABEL: @test_vsqaddd_u64( -// CHECK: [[VSQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.usqadd.i64(i64 %a, i64 %b) -// CHECK: ret i64 [[VSQADDD_U64_I]] +// CHECK-LABEL: define dso_local i64 @test_vsqaddd_u64( +// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.usqadd.i64(i64 [[A]], i64 [[B]]) +// CHECK-NEXT: ret i64 [[VSQADDD_U64_I]] +// uint64_t test_vsqaddd_u64(uint64_t a, int64_t b) { return (uint64_t)vsqaddd_u64(a, b); } -// CHECK-LABEL: @test_vqdmlalh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %c, i64 0 -// CHECK: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0 -// CHECK: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0_I]]) -// CHECK: ret i32 [[VQDMLXL1_I]] +// CHECK-LABEL: define dso_local i32 @test_vqdmlalh_s16( +// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], i16 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0 +// CHECK-NEXT: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0 +// CHECK-NEXT: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A]], i32 [[LANE0_I]]) +// CHECK-NEXT: ret i32 [[VQDMLXL1_I]] +// int32_t test_vqdmlalh_s16(int32_t a, int16_t b, int16_t c) { return (int32_t)vqdmlalh_s16(a, b, c); } -// CHECK-LABEL: @test_vqdmlals_s32( -// CHECK: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c) -// CHECK: 
[[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL_I]]) -// CHECK: ret i64 [[VQDMLXL1_I]] +// CHECK-LABEL: define dso_local i64 @test_vqdmlals_s32( +// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[C]]) +// CHECK-NEXT: [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[A]], i64 [[VQDMLXL_I]]) +// CHECK-NEXT: ret i64 [[VQDMLXL1_I]] +// int64_t test_vqdmlals_s32(int64_t a, int32_t b, int32_t c) { return (int64_t)vqdmlals_s32(a, b, c); } -// CHECK-LABEL: @test_vqdmlslh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %c, i64 0 -// CHECK: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0 -// CHECK: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0_I]]) -// CHECK: ret i32 [[VQDMLXL1_I]] +// CHECK-LABEL: define dso_local i32 @test_vqdmlslh_s16( +// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], i16 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0 +// CHECK-NEXT: [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0 +// CHECK-NEXT: [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A]], i32 [[LANE0_I]]) +// CHECK-NEXT: ret i32 [[VQDMLXL1_I]] +// int32_t test_vqdmlslh_s16(int32_t a, int16_t b, int16_t c) { return (int32_t)vqdmlslh_s16(a, b, c); } -// CHECK-LABEL: @test_vqdmlsls_s32( -// CHECK: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c) -// CHECK: [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL_I]]) -// CHECK: ret i64 [[VQDMLXL1_I]] +// CHECK-LABEL: define dso_local i64 @test_vqdmlsls_s32( +// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[C]]) +// CHECK-NEXT: [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[A]], i64 [[VQDMLXL_I]]) +// CHECK-NEXT: ret i64 [[VQDMLXL1_I]] +// int64_t test_vqdmlsls_s32(int64_t a, int32_t b, int32_t c) { return (int64_t)vqdmlsls_s32(a, b, c); } -// CHECK-LABEL: @test_vqdmullh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) -// CHECK: [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0 -// CHECK: ret i32 [[TMP2]] +// CHECK-LABEL: define dso_local i32 @test_vqdmullh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]], i16 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0 +// CHECK-NEXT: ret i32 [[TMP2]] +// int32_t test_vqdmullh_s16(int16_t a, int16_t b) { return (int32_t)vqdmullh_s16(a, b); } -// CHECK-LABEL: @test_vqdmulls_s32( -// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 %b) -// CHECK: ret i64 [[VQDMULLS_S32_I]] +// CHECK-LABEL: define dso_local i64 @test_vqdmulls_s32( +// CHECK-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[A]], i32 [[B]]) +// CHECK-NEXT: ret i64 [[VQDMULLS_S32_I]] +// int64_t test_vqdmulls_s32(int32_t a, int32_t b) { return (int64_t)vqdmulls_s32(a, b); } -// CHECK-LABEL: @test_vqmovunh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0 -// CHECK: [[VQMOVUNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVUNH_S16_I]], i64 0 -// CHECK: ret i8 [[TMP1]] +// CHECK-LABEL: define dso_local i8 @test_vqmovunh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[VQMOVUNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVUNH_S16_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP1]] +// uint8_t test_vqmovunh_s16(int16_t a) { return (uint8_t)vqmovunh_s16(a); } -// CHECK-LABEL: @test_vqmovuns_s32( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0 -// CHECK: [[VQMOVUNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVUNS_S32_I]], i64 0 -// CHECK: ret i16 [[TMP1]] +// CHECK-LABEL: define dso_local i16 @test_vqmovuns_s32( +// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 +// CHECK-NEXT: [[VQMOVUNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVUNS_S32_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP1]] +// uint16_t test_vqmovuns_s32(int32_t a) { return (uint16_t)vqmovuns_s32(a); } -// CHECK-LABEL: @test_vqmovund_s64( -// CHECK: [[VQMOVUND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 %a) -// CHECK: ret i32 [[VQMOVUND_S64_I]] +// CHECK-LABEL: define dso_local i32 @test_vqmovund_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQMOVUND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 [[A]]) +// CHECK-NEXT: ret i32 [[VQMOVUND_S64_I]] +// uint32_t test_vqmovund_s64(int64_t a) { return (uint32_t)vqmovund_s64(a); } -// CHECK-LABEL: @test_vqmovnh_s16( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0 -// CHECK: [[VQMOVNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_S16_I]], i64 0 -// CHECK: ret i8 [[TMP1]] +// CHECK-LABEL: define dso_local i8 @test_vqmovnh_s16( +// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 
x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[VQMOVNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_S16_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP1]] +// int8_t test_vqmovnh_s16(int16_t a) { return (int8_t)vqmovnh_s16(a); } -// CHECK-LABEL: @test_vqmovns_s32( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0 -// CHECK: [[VQMOVNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_S32_I]], i64 0 -// CHECK: ret i16 [[TMP1]] +// CHECK-LABEL: define dso_local i16 @test_vqmovns_s32( +// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 +// CHECK-NEXT: [[VQMOVNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_S32_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP1]] +// int16_t test_vqmovns_s32(int32_t a) { return (int16_t)vqmovns_s32(a); } -// CHECK-LABEL: @test_vqmovnd_s64( -// CHECK: [[VQMOVND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 %a) -// CHECK: ret i32 [[VQMOVND_S64_I]] +// CHECK-LABEL: define dso_local i32 @test_vqmovnd_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQMOVND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 [[A]]) +// CHECK-NEXT: ret i32 [[VQMOVND_S64_I]] +// int32_t test_vqmovnd_s64(int64_t a) { return (int32_t)vqmovnd_s64(a); } -// CHECK-LABEL: @test_vqmovnh_u16( -// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0 -// CHECK: [[VQMOVNH_U16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_U16_I]], i64 0 -// CHECK: ret i8 [[TMP1]] +// CHECK-LABEL: define dso_local i8 @test_vqmovnh_u16( +// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[VQMOVNH_U16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_U16_I]], i64 0 +// CHECK-NEXT: ret i8 [[TMP1]] +// int8_t test_vqmovnh_u16(int16_t a) { return (int8_t)vqmovnh_u16(a); } -// CHECK-LABEL: @test_vqmovns_u32( -// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0 -// CHECK: [[VQMOVNS_U32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[TMP0]]) -// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_U32_I]], i64 0 -// CHECK: ret i16 [[TMP1]] +// CHECK-LABEL: define dso_local i16 @test_vqmovns_u32( +// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 +// CHECK-NEXT: [[VQMOVNS_U32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[TMP0]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_U32_I]], i64 0 +// CHECK-NEXT: ret i16 [[TMP1]] +// int16_t test_vqmovns_u32(int32_t a) { return (int16_t)vqmovns_u32(a); } -// CHECK-LABEL: @test_vqmovnd_u64( -// CHECK: [[VQMOVND_U64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 %a) -// CHECK: ret i32 [[VQMOVND_U64_I]] +// CHECK-LABEL: define dso_local i32 @test_vqmovnd_u64( +// CHECK-SAME: 
i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQMOVND_U64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 [[A]]) +// CHECK-NEXT: ret i32 [[VQMOVND_U64_I]] +// int32_t test_vqmovnd_u64(int64_t a) { return (int32_t)vqmovnd_u64(a); } -// CHECK-LABEL: @test_vceqs_f32( -// CHECK: [[TMP0:%.*]] = fcmp oeq float %a, %b -// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 -// CHECK: ret i32 [[VCMPD_I]] +// CHECK-LABEL: define dso_local i32 @test_vceqs_f32( +// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq float [[A]], [[B]] +// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[VCMPD_I]] +// uint32_t test_vceqs_f32(float32_t a, float32_t b) { return (uint32_t)vceqs_f32(a, b); } -// CHECK-LABEL: @test_vceqd_f64( -// CHECK: [[TMP0:%.*]] = fcmp oeq double %a, %b -// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCMPD_I]] +// CHECK-LABEL: define dso_local i64 @test_vceqd_f64( +// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq double [[A]], [[B]] +// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCMPD_I]] +// uint64_t test_vceqd_f64(float64_t a, float64_t b) { return (uint64_t)vceqd_f64(a, b); } -// CHECK-LABEL: @test_vceqzs_f32( -// CHECK: [[TMP0:%.*]] = fcmp oeq float %a, 0.000000e+00 -// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32 -// CHECK: ret i32 [[VCEQZ_I]] +// CHECK-LABEL: define dso_local i32 @test_vceqzs_f32( +// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq float [[A]], 0.000000e+00 +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[VCEQZ_I]] +// uint32_t test_vceqzs_f32(float32_t a) { return (uint32_t)vceqzs_f32(a); } -// CHECK-LABEL: @test_vceqzd_f64( -// CHECK: [[TMP0:%.*]] = fcmp oeq double %a, 0.000000e+00 -// CHECK: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCEQZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vceqzd_f64( +// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq double [[A]], 0.000000e+00 +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCEQZ_I]] +// uint64_t test_vceqzd_f64(float64_t a) { return (uint64_t)vceqzd_f64(a); } -// CHECK-LABEL: @test_vcges_f32( -// CHECK: [[TMP0:%.*]] = fcmp oge float %a, %b -// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 -// CHECK: ret i32 [[VCMPD_I]] +// CHECK-LABEL: define dso_local i32 @test_vcges_f32( +// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge float [[A]], [[B]] +// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[VCMPD_I]] +// uint32_t test_vcges_f32(float32_t a, float32_t b) { return (uint32_t)vcges_f32(a, b); } -// CHECK-LABEL: @test_vcged_f64( -// CHECK: [[TMP0:%.*]] = fcmp oge double %a, %b -// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCMPD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcged_f64( +// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge double [[A]], [[B]] +// CHECK-NEXT: [[VCMPD_I:%.*]] = 
sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCMPD_I]] +// uint64_t test_vcged_f64(float64_t a, float64_t b) { return (uint64_t)vcged_f64(a, b); } -// CHECK-LABEL: @test_vcgezs_f32( -// CHECK: [[TMP0:%.*]] = fcmp oge float %a, 0.000000e+00 -// CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32 -// CHECK: ret i32 [[VCGEZ_I]] +// CHECK-LABEL: define dso_local i32 @test_vcgezs_f32( +// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge float [[A]], 0.000000e+00 +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[VCGEZ_I]] +// uint32_t test_vcgezs_f32(float32_t a) { return (uint32_t)vcgezs_f32(a); } -// CHECK-LABEL: @test_vcgezd_f64( -// CHECK: [[TMP0:%.*]] = fcmp oge double %a, 0.000000e+00 -// CHECK: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCGEZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vcgezd_f64( +// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge double [[A]], 0.000000e+00 +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCGEZ_I]] +// uint64_t test_vcgezd_f64(float64_t a) { return (uint64_t)vcgezd_f64(a); } -// CHECK-LABEL: @test_vcgts_f32( -// CHECK: [[TMP0:%.*]] = fcmp ogt float %a, %b -// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 -// CHECK: ret i32 [[VCMPD_I]] +// CHECK-LABEL: define dso_local i32 @test_vcgts_f32( +// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt float [[A]], [[B]] +// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[VCMPD_I]] +// uint32_t test_vcgts_f32(float32_t a, float32_t b) { return (uint32_t)vcgts_f32(a, b); } -// CHECK-LABEL: @test_vcgtd_f64( -// CHECK: [[TMP0:%.*]] = fcmp ogt double %a, %b -// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCMPD_I]] +// CHECK-LABEL: define dso_local i64 @test_vcgtd_f64( +// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt double [[A]], [[B]] +// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCMPD_I]] +// uint64_t test_vcgtd_f64(float64_t a, float64_t b) { return (uint64_t)vcgtd_f64(a, b); } -// CHECK-LABEL: @test_vcgtzs_f32( -// CHECK: [[TMP0:%.*]] = fcmp ogt float %a, 0.000000e+00 -// CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32 -// CHECK: ret i32 [[VCGTZ_I]] +// CHECK-LABEL: define dso_local i32 @test_vcgtzs_f32( +// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt float [[A]], 0.000000e+00 +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[VCGTZ_I]] +// uint32_t test_vcgtzs_f32(float32_t a) { return (uint32_t)vcgtzs_f32(a); } -// CHECK-LABEL: @test_vcgtzd_f64( -// CHECK: [[TMP0:%.*]] = fcmp ogt double %a, 0.000000e+00 -// CHECK: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64 -// CHECK: ret i64 [[VCGTZ_I]] +// CHECK-LABEL: define dso_local i64 @test_vcgtzd_f64( +// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt double [[A]], 0.000000e+00 +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[VCGTZ_I]] +// uint64_t test_vcgtzd_f64(float64_t a) { return (uint64_t)vcgtzd_f64(a); } -// CHECK-LABEL: 
@test_vcles_f32(
-// CHECK: [[TMP0:%.*]] = fcmp ole float %a, %b
-// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// CHECK: ret i32 [[VCMPD_I]]
+// CHECK-LABEL: define dso_local i32 @test_vcles_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole float [[A]], [[B]]
+// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK-NEXT: ret i32 [[VCMPD_I]]
+//
 uint32_t test_vcles_f32(float32_t a, float32_t b) {
   return (uint32_t)vcles_f32(a, b);
 }

-// CHECK-LABEL: @test_vcled_f64(
-// CHECK: [[TMP0:%.*]] = fcmp ole double %a, %b
-// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCMPD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcled_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole double [[A]], [[B]]
+// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCMPD_I]]
+//
 uint64_t test_vcled_f64(float64_t a, float64_t b) {
   return (uint64_t)vcled_f64(a, b);
 }

-// CHECK-LABEL: @test_vclezs_f32(
-// CHECK: [[TMP0:%.*]] = fcmp ole float %a, 0.000000e+00
-// CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// CHECK: ret i32 [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local i32 @test_vclezs_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole float [[A]], 0.000000e+00
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK-NEXT: ret i32 [[VCLEZ_I]]
+//
 uint32_t test_vclezs_f32(float32_t a) {
   return (uint32_t)vclezs_f32(a);
 }

-// CHECK-LABEL: @test_vclezd_f64(
-// CHECK: [[TMP0:%.*]] = fcmp ole double %a, 0.000000e+00
-// CHECK: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCLEZ_I]]
+// CHECK-LABEL: define dso_local i64 @test_vclezd_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole double [[A]], 0.000000e+00
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCLEZ_I]]
+//
 uint64_t test_vclezd_f64(float64_t a) {
   return (uint64_t)vclezd_f64(a);
 }

-// CHECK-LABEL: @test_vclts_f32(
-// CHECK: [[TMP0:%.*]] = fcmp olt float %a, %b
-// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
-// CHECK: ret i32 [[VCMPD_I]]
+// CHECK-LABEL: define dso_local i32 @test_vclts_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt float [[A]], [[B]]
+// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK-NEXT: ret i32 [[VCMPD_I]]
+//
 uint32_t test_vclts_f32(float32_t a, float32_t b) {
   return (uint32_t)vclts_f32(a, b);
 }

-// CHECK-LABEL: @test_vcltd_f64(
-// CHECK: [[TMP0:%.*]] = fcmp olt double %a, %b
-// CHECK: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCMPD_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcltd_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt double [[A]], [[B]]
+// CHECK-NEXT: [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCMPD_I]]
+//
 uint64_t test_vcltd_f64(float64_t a, float64_t b) {
   return (uint64_t)vcltd_f64(a, b);
 }

-// CHECK-LABEL: @test_vcltzs_f32(
-// CHECK: [[TMP0:%.*]] = fcmp olt float %a, 0.000000e+00
-// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
-// CHECK: ret i32 [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local i32 @test_vcltzs_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt float [[A]], 0.000000e+00
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK-NEXT: ret i32 [[VCLTZ_I]]
+//
 uint32_t test_vcltzs_f32(float32_t a) {
   return (uint32_t)vcltzs_f32(a);
 }

-// CHECK-LABEL: @test_vcltzd_f64(
-// CHECK: [[TMP0:%.*]] = fcmp olt double %a, 0.000000e+00
-// CHECK: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
-// CHECK: ret i64 [[VCLTZ_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcltzd_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt double [[A]], 0.000000e+00
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK-NEXT: ret i64 [[VCLTZ_I]]
+//
 uint64_t test_vcltzd_f64(float64_t a) {
   return (uint64_t)vcltzd_f64(a);
 }

-// CHECK-LABEL: @test_vcages_f32(
-// CHECK: [[VCAGES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float %a, float %b)
-// CHECK: ret i32 [[VCAGES_F32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vcages_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float [[A]], float [[B]])
+// CHECK-NEXT: ret i32 [[VCAGES_F32_I]]
+//
 uint32_t test_vcages_f32(float32_t a, float32_t b) {
   return (uint32_t)vcages_f32(a, b);
 }

-// CHECK-LABEL: @test_vcaged_f64(
-// CHECK: [[VCAGED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %a, double %b)
-// CHECK: ret i64 [[VCAGED_F64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcaged_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double [[A]], double [[B]])
+// CHECK-NEXT: ret i64 [[VCAGED_F64_I]]
+//
 uint64_t test_vcaged_f64(float64_t a, float64_t b) {
   return (uint64_t)vcaged_f64(a, b);
 }

-// CHECK-LABEL: @test_vcagts_f32(
-// CHECK: [[VCAGTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %a, float %b)
-// CHECK: ret i32 [[VCAGTS_F32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vcagts_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float [[A]], float [[B]])
+// CHECK-NEXT: ret i32 [[VCAGTS_F32_I]]
+//
 uint32_t test_vcagts_f32(float32_t a, float32_t b) {
   return (uint32_t)vcagts_f32(a, b);
 }

-// CHECK-LABEL: @test_vcagtd_f64(
-// CHECK: [[VCAGTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %a, double %b)
-// CHECK: ret i64 [[VCAGTD_F64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcagtd_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double [[A]], double [[B]])
+// CHECK-NEXT: ret i64 [[VCAGTD_F64_I]]
+//
 uint64_t test_vcagtd_f64(float64_t a, float64_t b) {
   return (uint64_t)vcagtd_f64(a, b);
 }

-// CHECK-LABEL: @test_vcales_f32(
-// CHECK: [[VCALES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float %b, float %a)
-// CHECK: ret i32 [[VCALES_F32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vcales_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float [[B]], float [[A]])
+// CHECK-NEXT: ret i32 [[VCALES_F32_I]]
+//
 uint32_t test_vcales_f32(float32_t a, float32_t b) {
   return (uint32_t)vcales_f32(a, b);
 }

-// CHECK-LABEL: @test_vcaled_f64(
-// CHECK: [[VCALED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %b, double %a)
-// CHECK: ret i64 [[VCALED_F64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcaled_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double [[B]], double [[A]])
+// CHECK-NEXT: ret i64 [[VCALED_F64_I]]
+//
 uint64_t test_vcaled_f64(float64_t a, float64_t b) {
   return (uint64_t)vcaled_f64(a, b);
 }

-// CHECK-LABEL: @test_vcalts_f32(
-// CHECK: [[VCALTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %b, float %a)
-// CHECK: ret i32 [[VCALTS_F32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vcalts_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float [[B]], float [[A]])
+// CHECK-NEXT: ret i32 [[VCALTS_F32_I]]
+//
 uint32_t test_vcalts_f32(float32_t a, float32_t b) {
   return (uint32_t)vcalts_f32(a, b);
 }

-// CHECK-LABEL: @test_vcaltd_f64(
-// CHECK: [[VCALTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %b, double %a)
-// CHECK: ret i64 [[VCALTD_F64_I]]
+// CHECK-LABEL: define dso_local i64 @test_vcaltd_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double [[B]], double [[A]])
+// CHECK-NEXT: ret i64 [[VCALTD_F64_I]]
+//
 uint64_t test_vcaltd_f64(float64_t a, float64_t b) {
   return (uint64_t)vcaltd_f64(a, b);
 }

-// CHECK-LABEL: @test_vshrd_n_s64(
-// CHECK: [[SHRD_N:%.*]] = ashr i64 %a, 1
-// CHECK: ret i64 [[SHRD_N]]
+// CHECK-LABEL: define dso_local i64 @test_vshrd_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHRD_N:%.*]] = ashr i64 [[A]], 1
+// CHECK-NEXT: ret i64 [[SHRD_N]]
+//
 int64_t test_vshrd_n_s64(int64_t a) {
   return (int64_t)vshrd_n_s64(a, 1);
 }

-// CHECK-LABEL: @test_vshr_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vshr_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <1 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHR_N]]
+//
 int64x1_t test_vshr_n_s64(int64x1_t a) {
   return vshr_n_s64(a, 1);
 }

-// CHECK-LABEL: @test_vshrd_n_u64(
-// CHECK: ret i64 0
+// CHECK-LABEL: define dso_local i64 @test_vshrd_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 0
+//
 uint64_t test_vshrd_n_u64(uint64_t a) {
   return (uint64_t)vshrd_n_u64(a, 64);
 }

-// CHECK-LABEL: @test_vshrd_n_u64_2(
-// CHECK: ret i64 0
+// CHECK-LABEL: define dso_local i64 @test_vshrd_n_u64_2(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 0
+//
 uint64_t test_vshrd_n_u64_2() {
   uint64_t a = UINT64_C(0xf000000000000000);
   return vshrd_n_u64(a, 64);
 }

-// CHECK-LABEL: @test_vshr_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHR_N]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vshr_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <1 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHR_N]]
+//
 uint64x1_t test_vshr_n_u64(uint64x1_t a) {
   return vshr_n_u64(a, 1);
 }

-// CHECK-LABEL: @test_vrshrd_n_s64(
-// CHECK: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 -63)
-// CHECK: ret i64 [[VRSHR_N]]
+// CHECK-LABEL: define dso_local i64 @test_vrshrd_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[A]], i64 -63)
+// CHECK-NEXT: ret i64 [[VRSHR_N]]
+//
 int64_t test_vrshrd_n_s64(int64_t a) {
   return (int64_t)vrshrd_n_s64(a, 63);
 }

-// CHECK-LABEL: @test_vrshr_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
-// CHECK: ret <1 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vrshr_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <1 x i64> [[VRSHR_N1]]
+//
 int64x1_t test_vrshr_n_s64(int64x1_t a) {
   return vrshr_n_s64(a, 1);
 }

-// CHECK-LABEL: @test_vrshrd_n_u64(
-// CHECK: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 -63)
-// CHECK: ret i64 [[VRSHR_N]]
+// CHECK-LABEL: define dso_local i64 @test_vrshrd_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[A]], i64 -63)
+// CHECK-NEXT: ret i64 [[VRSHR_N]]
+//
 uint64_t test_vrshrd_n_u64(uint64_t a) {
   return (uint64_t)vrshrd_n_u64(a, 63);
 }

-// CHECK-LABEL: @test_vrshr_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
-// CHECK: ret <1 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vrshr_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <1 x i64> [[VRSHR_N1]]
+//
 uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
   return vrshr_n_u64(a, 1);
 }

-// CHECK-LABEL: @test_vsrad_n_s64(
-// CHECK: [[SHRD_N:%.*]] = ashr i64 %b, 63
-// CHECK: [[TMP0:%.*]] = add i64 %a, [[SHRD_N]]
-// CHECK: ret i64 [[TMP0]]
+// CHECK-LABEL: define dso_local i64 @test_vsrad_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHRD_N:%.*]] = ashr i64 [[B]], 63
+// CHECK-NEXT: [[TMP0:%.*]] = add i64 [[A]], [[SHRD_N]]
+// CHECK-NEXT: ret i64 [[TMP0]]
+//
 int64_t test_vsrad_n_s64(int64_t a, int64_t b) {
   return (int64_t)vsrad_n_s64(a, b, 63);
 }

-// CHECK-LABEL: @test_vsra_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], splat (i64 1)
-// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <1 x i64> [[TMP4]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsra_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <1 x i64> [[B]], splat (i64 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <1 x i64> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
   return vsra_n_s64(a, b, 1);
 }

-// CHECK-LABEL: @test_vsrad_n_u64(
-// CHECK: [[SHRD_N:%.*]] = lshr i64 %b, 63
-// CHECK: [[TMP0:%.*]] = add i64 %a, [[SHRD_N]]
-// CHECK: ret i64 [[TMP0]]
+// CHECK-LABEL: define dso_local i64 @test_vsrad_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHRD_N:%.*]] = lshr i64 [[B]], 63
+// CHECK-NEXT: [[TMP0:%.*]] = add i64 [[A]], [[SHRD_N]]
+// CHECK-NEXT: ret i64 [[TMP0]]
+//
 uint64_t test_vsrad_n_u64(uint64_t a, uint64_t b) {
   return (uint64_t)vsrad_n_u64(a, b, 63);
 }

-// CHECK-LABEL: @test_vsrad_n_u64_2(
-// CHECK: ret i64 %a
+// CHECK-LABEL: define dso_local i64 @test_vsrad_n_u64_2(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret i64 [[A]]
+//
 uint64_t test_vsrad_n_u64_2(uint64_t a, uint64_t b) {
   return (uint64_t)vsrad_n_u64(a, b, 64);
 }

-// CHECK-LABEL: @test_vsra_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], splat (i64 1)
-// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <1 x i64> [[TMP4]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsra_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <1 x i64> [[B]], splat (i64 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <1 x i64> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
   return vsra_n_u64(a, b, 1);
 }

-// CHECK-LABEL: @test_vrsrad_n_s64(
-// CHECK: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %b, i64 -63)
-// CHECK: [[TMP1:%.*]] = add i64 %a, [[TMP0]]
-// CHECK: ret i64 [[TMP1]]
+// CHECK-LABEL: define dso_local i64 @test_vrsrad_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[B]], i64 -63)
+// CHECK-NEXT: [[TMP1:%.*]] = add i64 [[A]], [[TMP0]]
+// CHECK-NEXT: ret i64 [[TMP1]]
+//
 int64_t test_vrsrad_n_s64(int64_t a, int64_t b) {
   return (int64_t)vrsrad_n_s64(a, b, 63);
 }

-// CHECK-LABEL: @test_vrsra_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <1 x i64> [[TMP3]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vrsra_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[B]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: [[TMP0:%.*]] = add <1 x i64> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
   return vrsra_n_s64(a, b, 1);
 }

-// CHECK-LABEL: @test_vrsrad_n_u64(
-// CHECK: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %b, i64 -63)
-// CHECK: [[TMP1:%.*]] = add i64 %a, [[TMP0]]
-// CHECK: ret i64 [[TMP1]]
+// CHECK-LABEL: define dso_local i64 @test_vrsrad_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[B]], i64 -63)
+// CHECK-NEXT: [[TMP1:%.*]] = add i64 [[A]], [[TMP0]]
+// CHECK-NEXT: ret i64 [[TMP1]]
+//
 uint64_t test_vrsrad_n_u64(uint64_t a, uint64_t b) {
   return (uint64_t)vrsrad_n_u64(a, b, 63);
 }

-// CHECK-LABEL: @test_vrsra_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]]
-// CHECK: ret <1 x i64> [[TMP3]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vrsra_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[B]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: [[TMP0:%.*]] = add <1 x i64> [[A]], [[VRSHR_N1]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
   return vrsra_n_u64(a, b, 1);
 }

-// CHECK-LABEL: @test_vshld_n_s64(
-// CHECK: [[SHLD_N:%.*]] = shl i64 %a, 1
-// CHECK: ret i64 [[SHLD_N]]
+// CHECK-LABEL: define dso_local i64 @test_vshld_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHLD_N:%.*]] = shl i64 [[A]], 1
+// CHECK-NEXT: ret i64 [[SHLD_N]]
+//
 int64_t test_vshld_n_s64(int64_t a) {
   return (int64_t)vshld_n_s64(a, 1);
 }

-// CHECK-LABEL: @test_vshl_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vshl_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <1 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHL_N]]
+//
 int64x1_t test_vshl_n_s64(int64x1_t a) {
   return vshl_n_s64(a, 1);
 }

-// CHECK-LABEL: @test_vshld_n_u64(
-// CHECK: [[SHLD_N:%.*]] = shl i64 %a, 63
-// CHECK: ret i64 [[SHLD_N]]
+// CHECK-LABEL: define dso_local i64 @test_vshld_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHLD_N:%.*]] = shl i64 [[A]], 63
+// CHECK-NEXT: ret i64 [[SHLD_N]]
+//
 uint64_t test_vshld_n_u64(uint64_t a) {
   return (uint64_t)vshld_n_u64(a, 63);
 }

-// CHECK-LABEL: @test_vshl_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHL_N]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vshl_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <1 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHL_N]]
+//
 uint64x1_t test_vshl_n_u64(uint64x1_t a) {
   return vshl_n_u64(a, 1);
 }

-// CHECK-LABEL: @test_vqshlb_n_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[VQSHLB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> )
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_S8]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshlb_n_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> )
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_S8]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 int8_t test_vqshlb_n_s8(int8_t a) {
   return (int8_t)vqshlb_n_s8(a, 7);
 }

-// CHECK-LABEL: @test_vqshlh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHLH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> )
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_S16]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshlh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> )
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_S16]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 int16_t test_vqshlh_n_s16(int16_t a) {
   return (int16_t)vqshlh_n_s16(a, 15);
 }

-// CHECK-LABEL: @test_vqshls_n_s32(
-// CHECK: [[VQSHLS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 31)
-// CHECK: ret i32 [[VQSHLS_N_S32]]
+// CHECK-LABEL: define dso_local i32 @test_vqshls_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 [[A]], i32 31)
+// CHECK-NEXT: ret i32 [[VQSHLS_N_S32]]
+//
 int32_t test_vqshls_n_s32(int32_t a) {
   return (int32_t)vqshls_n_s32(a, 31);
 }

-// CHECK-LABEL: @test_vqshld_n_s64(
-// CHECK: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 63)
-// CHECK: ret i64 [[VQSHL_N]]
+// CHECK-LABEL: define dso_local i64 @test_vqshld_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 [[A]], i64 63)
+// CHECK-NEXT: ret i64 [[VQSHL_N]]
+//
 int64_t test_vqshld_n_s64(int64_t a) {
   return (int64_t)vqshld_n_s64(a, 63);
 }

-// CHECK-LABEL: @test_vqshl_n_s8(
-// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
-// CHECK: ret <8 x i8> [[VQSHL_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshl_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[A]], <8 x i8> zeroinitializer)
+// CHECK-NEXT: ret <8 x i8> [[VQSHL_N]]
+//
 int8x8_t test_vqshl_n_s8(int8x8_t a) {
   return vqshl_n_s8(a, 0);
 }

-// CHECK-LABEL: @test_vqshlq_n_s8(
-// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
-// CHECK: ret <16 x i8> [[VQSHL_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshlq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> [[A]], <16 x i8> zeroinitializer)
+// CHECK-NEXT: ret <16 x i8> [[VQSHL_N]]
+//
 int8x16_t test_vqshlq_n_s8(int8x16_t a) {
   return vqshlq_n_s8(a, 0);
 }

-// CHECK-LABEL: @test_vqshl_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer)
-// CHECK: ret <4 x i16> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshl_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[A]], <4 x i16> zeroinitializer)
+// CHECK-NEXT: ret <4 x i16> [[VQSHL_N1]]
+//
 int16x4_t test_vqshl_n_s16(int16x4_t a) {
   return vqshl_n_s16(a, 0);
 }

-// CHECK-LABEL: @test_vqshlq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer)
-// CHECK: ret <8 x i16> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshlq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[A]], <8 x i16> zeroinitializer)
+// CHECK-NEXT: ret <8 x i16> [[VQSHL_N1]]
+//
 int16x8_t test_vqshlq_n_s16(int16x8_t a) {
   return vqshlq_n_s16(a, 0);
 }

-// CHECK-LABEL: @test_vqshl_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer)
-// CHECK: ret <2 x i32> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshl_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[A]], <2 x i32> zeroinitializer)
+// CHECK-NEXT: ret <2 x i32> [[VQSHL_N1]]
+//
 int32x2_t test_vqshl_n_s32(int32x2_t a) {
   return vqshl_n_s32(a, 0);
 }

-// CHECK-LABEL: @test_vqshlq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer)
-// CHECK: ret <4 x i32> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshlq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[A]], <4 x i32> zeroinitializer)
+// CHECK-NEXT: ret <4 x i32> [[VQSHL_N1]]
+//
 int32x4_t test_vqshlq_n_s32(int32x4_t a) {
   return vqshlq_n_s32(a, 0);
 }

-// CHECK-LABEL: @test_vqshlq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
-// CHECK: ret <2 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqshlq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[A]], <2 x i64> zeroinitializer)
+// CHECK-NEXT: ret <2 x i64> [[VQSHL_N1]]
+//
 int64x2_t test_vqshlq_n_s64(int64x2_t a) {
   return vqshlq_n_s64(a, 0);
 }

-// CHECK-LABEL: @test_vqshl_n_u8(
-// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
-// CHECK: ret <8 x i8> [[VQSHL_N]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqshl_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[A]], <8 x i8> zeroinitializer)
+// CHECK-NEXT: ret <8 x i8> [[VQSHL_N]]
+//
 uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
   return vqshl_n_u8(a, 0);
 }

-// CHECK-LABEL: @test_vqshlq_n_u8(
-// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
-// CHECK: ret <16 x i8> [[VQSHL_N]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqshlq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> [[A]], <16 x i8> zeroinitializer)
+// CHECK-NEXT: ret <16 x i8> [[VQSHL_N]]
+//
 uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
   return vqshlq_n_u8(a, 0);
 }

-// CHECK-LABEL: @test_vqshl_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer)
-// CHECK: ret <4 x i16> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqshl_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[A]], <4 x i16> zeroinitializer)
+// CHECK-NEXT: ret <4 x i16> [[VQSHL_N1]]
+//
 uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
   return vqshl_n_u16(a, 0);
 }

-// CHECK-LABEL: @test_vqshlq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer)
-// CHECK: ret <8 x i16> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqshlq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[A]], <8 x i16> zeroinitializer)
+// CHECK-NEXT: ret <8 x i16> [[VQSHL_N1]]
+//
 uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
   return vqshlq_n_u16(a, 0);
 }

-// CHECK-LABEL: @test_vqshl_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer)
-// CHECK: ret <2 x i32> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqshl_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[A]], <2 x i32> zeroinitializer)
+// CHECK-NEXT: ret <2 x i32> [[VQSHL_N1]]
+//
 uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
   return vqshl_n_u32(a, 0);
 }

-// CHECK-LABEL: @test_vqshlq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer)
-// CHECK: ret <4 x i32> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqshlq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[A]], <4 x i32> zeroinitializer)
+// CHECK-NEXT: ret <4 x i32> [[VQSHL_N1]]
+//
 uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
   return vqshlq_n_u32(a, 0);
 }

-// CHECK-LABEL: @test_vqshlq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
-// CHECK: ret <2 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vqshlq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[A]], <2 x i64> zeroinitializer)
+// CHECK-NEXT: ret <2 x i64> [[VQSHL_N1]]
+//
 uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
   return vqshlq_n_u64(a, 0);
 }

-// CHECK-LABEL: @test_vqshl_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqshl_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_N1]]
+//
 int64x1_t test_vqshl_n_s64(int64x1_t a) {
   return vqshl_n_s64(a, 1);
 }

-// CHECK-LABEL: @test_vqshlb_n_u8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[VQSHLB_N_U8:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> )
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_U8]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshlb_n_u8(
+// CHECK-SAME: i8 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLB_N_U8:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> )
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_U8]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 uint8_t test_vqshlb_n_u8(uint8_t a) {
   return (uint8_t)vqshlb_n_u8(a, 7);
 }

-// CHECK-LABEL: @test_vqshlh_n_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHLH_N_U16:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> )
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_U16]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshlh_n_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLH_N_U16:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> )
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_U16]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 uint16_t test_vqshlh_n_u16(uint16_t a) {
   return (uint16_t)vqshlh_n_u16(a, 15);
 }

-// CHECK-LABEL: @test_vqshls_n_u32(
-// CHECK: [[VQSHLS_N_U32:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 31)
-// CHECK: ret i32 [[VQSHLS_N_U32]]
+// CHECK-LABEL: define dso_local i32 @test_vqshls_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLS_N_U32:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 [[A]], i32 31)
+// CHECK-NEXT: ret i32 [[VQSHLS_N_U32]]
+//
 uint32_t test_vqshls_n_u32(uint32_t a) {
   return (uint32_t)vqshls_n_u32(a, 31);
 }

-// CHECK-LABEL: @test_vqshld_n_u64(
-// CHECK: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 63)
-// CHECK: ret i64 [[VQSHL_N]]
+// CHECK-LABEL: define dso_local i64 @test_vqshld_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 [[A]], i64 63)
+// CHECK-NEXT: ret i64 [[VQSHL_N]]
+//
 uint64_t test_vqshld_n_u64(uint64_t a) {
   return (uint64_t)vqshld_n_u64(a, 63);
 }

-// CHECK-LABEL: @test_vqshl_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VQSHL_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqshl_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_N1]]
+//
 uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
   return vqshl_n_u64(a, 1);
 }

-// CHECK-LABEL: @test_vqshlub_n_s8(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 %a, i64 0
-// CHECK: [[VQSHLUB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[TMP0]], <8 x i8> )
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLUB_N_S8]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshlub_n_s8(
+// CHECK-SAME: i8 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLUB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[TMP0]], <8 x i8> )
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLUB_N_S8]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 int8_t test_vqshlub_n_s8(int8_t a) {
   return (int8_t)vqshlub_n_s8(a, 7);
 }

-// CHECK-LABEL: @test_vqshluh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHLUH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[TMP0]], <4 x i16> )
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLUH_N_S16]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshluh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHLUH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[TMP0]], <4 x i16> )
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLUH_N_S16]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 int16_t test_vqshluh_n_s16(int16_t a) {
   return (int16_t)vqshluh_n_s16(a, 15);
 }

-// CHECK-LABEL: @test_vqshlus_n_s32(
-// CHECK: [[VQSHLUS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshlu.i32(i32 %a, i32 31)
-// CHECK: ret i32 [[VQSHLUS_N_S32]]
+// CHECK-LABEL: define dso_local i32 @test_vqshlus_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLUS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshlu.i32(i32 [[A]], i32 31)
+// CHECK-NEXT: ret i32 [[VQSHLUS_N_S32]]
+//
 int32_t test_vqshlus_n_s32(int32_t a) {
   return (int32_t)vqshlus_n_s32(a, 31);
 }

-// CHECK-LABEL: @test_vqshlud_n_s64(
-// CHECK: [[VQSHLU_N:%.*]] = call i64 @llvm.aarch64.neon.sqshlu.i64(i64 %a, i64 63)
-// CHECK: ret i64 [[VQSHLU_N]]
+// CHECK-LABEL: define dso_local i64 @test_vqshlud_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLU_N:%.*]] = call i64 @llvm.aarch64.neon.sqshlu.i64(i64 [[A]], i64 63)
+// CHECK-NEXT: ret i64 [[VQSHLU_N]]
+//
 int64_t test_vqshlud_n_s64(int64_t a) {
   return (int64_t)vqshlud_n_s64(a, 63);
 }

-// CHECK-LABEL: @test_vqshlu_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VQSHLU_N1]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vqshlu_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VQSHLU_N1]]
+//
 uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
   return vqshlu_n_s64(a, 1);
 }

-// CHECK-LABEL: @test_vsrid_n_s64(
-// CHECK: [[VSRID_N_S64:%.*]] = bitcast i64 %a to <1 x i64>
-// CHECK: [[VSRID_N_S641:%.*]] = bitcast i64 %b to <1 x i64>
-// CHECK: [[VSRID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_S64]], <1 x i64> [[VSRID_N_S641]], i32 63)
-// CHECK: [[VSRID_N_S643:%.*]] = bitcast <1 x i64> [[VSRID_N_S642]] to i64
-// CHECK: ret i64 [[VSRID_N_S643]]
+// CHECK-LABEL: define dso_local i64 @test_vsrid_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRID_N_S64:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[VSRID_N_S641:%.*]] = insertelement <1 x i64> poison, i64 [[B]], i64 0
+// CHECK-NEXT: [[VSRID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_S64]], <1 x i64> [[VSRID_N_S641]], i32 63)
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[VSRID_N_S642]], i64 0
+// CHECK-NEXT: ret i64 [[TMP0]]
+//
 int64_t test_vsrid_n_s64(int64_t a, int64_t b) {
   return (int64_t)vsrid_n_s64(a, b, 63);
 }

-// CHECK-LABEL: @test_vsri_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1)
-// CHECK: ret <1 x i64> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsri_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], i32 1)
+// CHECK-NEXT: ret <1 x i64> [[VSRI_N2]]
+//
 int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) {
   return vsri_n_s64(a, b, 1);
 }

-// CHECK-LABEL: @test_vsrid_n_u64(
-// CHECK: [[VSRID_N_U64:%.*]] = bitcast i64 %a to <1 x i64>
-// CHECK: [[VSRID_N_U641:%.*]] = bitcast i64 %b to <1 x i64>
-// CHECK: [[VSRID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_U64]], <1 x i64> [[VSRID_N_U641]], i32 63)
-// CHECK: [[VSRID_N_U643:%.*]] = bitcast <1 x i64> [[VSRID_N_U642]] to i64
-// CHECK: ret i64 [[VSRID_N_U643]]
+// CHECK-LABEL: define dso_local i64 @test_vsrid_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRID_N_U64:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[VSRID_N_U641:%.*]] = insertelement <1 x i64> poison, i64 [[B]], i64 0
+// CHECK-NEXT: [[VSRID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_U64]], <1 x i64> [[VSRID_N_U641]], i32 63)
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[VSRID_N_U642]], i64 0
+// CHECK-NEXT: ret i64 [[TMP0]]
+//
 uint64_t test_vsrid_n_u64(uint64_t a, uint64_t b) {
   return (uint64_t)vsrid_n_u64(a, b, 63);
 }

-// CHECK-LABEL: @test_vsri_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1)
-// CHECK: ret <1 x i64> [[VSRI_N2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsri_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], i32 1)
+// CHECK-NEXT: ret <1 x i64> [[VSRI_N2]]
+//
 uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) {
   return vsri_n_u64(a, b, 1);
 }

-// CHECK-LABEL: @test_vslid_n_s64(
-// CHECK: [[VSLID_N_S64:%.*]] = bitcast i64 %a to <1 x i64>
-// CHECK: [[VSLID_N_S641:%.*]] = bitcast i64 %b to <1 x i64>
-// CHECK: [[VSLID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_S64]], <1 x i64> [[VSLID_N_S641]], i32 63)
-// CHECK: [[VSLID_N_S643:%.*]] = bitcast <1 x i64> [[VSLID_N_S642]] to i64
-// CHECK: ret i64 [[VSLID_N_S643]]
+// CHECK-LABEL: define dso_local i64 @test_vslid_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLID_N_S64:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[VSLID_N_S641:%.*]] = insertelement <1 x i64> poison, i64 [[B]], i64 0
+// CHECK-NEXT: [[VSLID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_S64]], <1 x i64> [[VSLID_N_S641]], i32 63)
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[VSLID_N_S642]], i64 0
+// CHECK-NEXT: ret i64 [[TMP0]]
+//
 int64_t test_vslid_n_s64(int64_t a, int64_t b) {
   return (int64_t)vslid_n_s64(a, b, 63);
 }

-// CHECK-LABEL: @test_vsli_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1)
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsli_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], i32 1)
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
 int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) {
   return vsli_n_s64(a, b, 1);
 }

-// CHECK-LABEL: @test_vslid_n_u64(
-// CHECK: [[VSLID_N_U64:%.*]] = bitcast i64 %a to <1 x i64>
-// CHECK: [[VSLID_N_U641:%.*]] = bitcast i64 %b to <1 x i64>
-// CHECK: [[VSLID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_U64]], <1 x i64> [[VSLID_N_U641]], i32 63)
-// CHECK: [[VSLID_N_U643:%.*]] = bitcast <1 x i64> [[VSLID_N_U642]] to i64
-// CHECK: ret i64 [[VSLID_N_U643]]
+// CHECK-LABEL: define dso_local i64 @test_vslid_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLID_N_U64:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[VSLID_N_U641:%.*]] = insertelement <1 x i64> poison, i64 [[B]], i64 0
+// CHECK-NEXT: [[VSLID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_U64]], <1 x i64> [[VSLID_N_U641]], i32 63)
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[VSLID_N_U642]], i64 0
+// CHECK-NEXT: ret i64 [[TMP0]]
+//
 uint64_t test_vslid_n_u64(uint64_t a, uint64_t b) {
   return (uint64_t)vslid_n_u64(a, b, 63);
 }

-// CHECK-LABEL: @test_vsli_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1)
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsli_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], i32 1)
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
 uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) {
   return vsli_n_u64(a, b, 1);
 }

-// CHECK-LABEL: @test_vqshrnh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_S16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshrnh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_S16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 int8_t test_vqshrnh_n_s16(int16_t a) {
   return (int8_t)vqshrnh_n_s16(a, 8);
 }

-// CHECK-LABEL: @test_vqshrns_n_s32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_S32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshrns_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_S32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 int16_t test_vqshrns_n_s32(int32_t a) {
   return (int16_t)vqshrns_n_s32(a, 16);
 }

-// CHECK-LABEL: @test_vqshrnd_n_s64(
-// CHECK: [[VQSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQSHRND_N_S64]]
+// CHECK-LABEL: define dso_local i32 @test_vqshrnd_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQSHRND_N_S64]]
+//
 int32_t test_vqshrnd_n_s64(int64_t a) {
   return (int32_t)vqshrnd_n_s64(a, 32);
 }

-// CHECK-LABEL: @test_vqshrnh_n_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_U16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshrnh_n_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_U16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 uint8_t test_vqshrnh_n_u16(uint16_t a) {
   return (uint8_t)vqshrnh_n_u16(a, 8);
 }

-// CHECK-LABEL: @test_vqshrns_n_u32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_U32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshrns_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_U32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 uint16_t test_vqshrns_n_u32(uint32_t a) {
   return (uint16_t)vqshrns_n_u32(a, 16);
 }

-// CHECK-LABEL: @test_vqshrnd_n_u64(
-// CHECK: [[VQSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQSHRND_N_U64]]
+// CHECK-LABEL: define dso_local i32 @test_vqshrnd_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQSHRND_N_U64]]
+//
 uint32_t test_vqshrnd_n_u64(uint64_t a) {
   return (uint32_t)vqshrnd_n_u64(a, 32);
 }

-// CHECK-LABEL: @test_vqrshrnh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQRSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_S16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqrshrnh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_S16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 int8_t test_vqrshrnh_n_s16(int16_t a) {
   return (int8_t)vqrshrnh_n_s16(a, 8);
 }

-// CHECK-LABEL: @test_vqrshrns_n_s32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQRSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_S32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqrshrns_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_S32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 int16_t test_vqrshrns_n_s32(int32_t a) {
   return (int16_t)vqrshrns_n_s32(a, 16);
 }

-// CHECK-LABEL: @test_vqrshrnd_n_s64(
-// CHECK: [[VQRSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQRSHRND_N_S64]]
+// CHECK-LABEL: define dso_local i32 @test_vqrshrnd_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQRSHRND_N_S64]]
+//
 int32_t test_vqrshrnd_n_s64(int64_t a) {
   return (int32_t)vqrshrnd_n_s64(a, 32);
 }

-// CHECK-LABEL: @test_vqrshrnh_n_u16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQRSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_U16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqrshrnh_n_u16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_U16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 uint8_t test_vqrshrnh_n_u16(uint16_t a) {
   return (uint8_t)vqrshrnh_n_u16(a, 8);
 }

-// CHECK-LABEL: @test_vqrshrns_n_u32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQRSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_U32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqrshrns_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_U32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 uint16_t test_vqrshrns_n_u32(uint32_t a) {
   return (uint16_t)vqrshrns_n_u32(a, 16);
 }

-// CHECK-LABEL: @test_vqrshrnd_n_u64(
-// CHECK: [[VQRSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQRSHRND_N_U64]]
+// CHECK-LABEL: define dso_local i32 @test_vqrshrnd_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQRSHRND_N_U64]]
+//
 uint32_t test_vqrshrnd_n_u64(uint64_t a) {
   return (uint32_t)vqrshrnd_n_u64(a, 32);
 }

-// CHECK-LABEL: @test_vqshrunh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRUNH_N_S16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqshrunh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRUNH_N_S16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 int8_t test_vqshrunh_n_s16(int16_t a) {
   return (int8_t)vqshrunh_n_s16(a, 8);
 }

-// CHECK-LABEL: @test_vqshruns_n_s32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRUNS_N_S32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqshruns_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRUNS_N_S32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 int16_t test_vqshruns_n_s32(int32_t a) {
   return (int16_t)vqshruns_n_s32(a, 16);
 }

-// CHECK-LABEL: @test_vqshrund_n_s64(
-// CHECK: [[VQSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQSHRUND_N_S64]]
+// CHECK-LABEL: define dso_local i32 @test_vqshrund_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQSHRUND_N_S64]]
+//
 int32_t test_vqshrund_n_s64(int64_t a) {
   return (int32_t)vqshrund_n_s64(a, 32);
 }

-// CHECK-LABEL: @test_vqrshrunh_n_s16(
-// CHECK: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 %a, i64 0
-// CHECK: [[VQRSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
-// CHECK: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRUNH_N_S16]], i64 0
-// CHECK: ret i8 [[TMP1]]
+// CHECK-LABEL: define dso_local i8 @test_vqrshrunh_n_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRUNH_N_S16]], i64 0
+// CHECK-NEXT: ret i8 [[TMP1]]
+//
 uint8_t test_vqrshrunh_n_s16(int16_t a) {
   return (uint8_t)vqrshrunh_n_s16(a, 8);
 }

-// CHECK-LABEL: @test_vqrshruns_n_s32(
-// CHECK: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
-// CHECK: [[VQRSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
-// CHECK: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRUNS_N_S32]], i64 0
-// CHECK: ret i16 [[TMP1]]
+// CHECK-LABEL: define dso_local i16 @test_vqrshruns_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VQRSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRUNS_N_S32]], i64 0
+// CHECK-NEXT: ret i16 [[TMP1]]
+//
 uint16_t test_vqrshruns_n_s32(int32_t a) {
   return (uint16_t)vqrshruns_n_s32(a, 16);
 }

-// CHECK-LABEL: @test_vqrshrund_n_s64(
-// CHECK: [[VQRSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 %a, i32 32)
-// CHECK: ret i32 [[VQRSHRUND_N_S64]]
+// CHECK-LABEL: define dso_local i32 @test_vqrshrund_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VQRSHRUND_N_S64]]
+//
 uint32_t test_vqrshrund_n_s64(int64_t a) {
   return (uint32_t)vqrshrund_n_s64(a, 32);
 }

-// CHECK-LABEL: @test_vcvts_n_f32_s32(
-// CHECK: [[VCVTS_N_F32_S32:%.*]] = call float @llvm.aarch64.neon.vcvtfxs2fp.f32.i32(i32 %a, i32 1)
-// CHECK: ret float [[VCVTS_N_F32_S32]]
+// CHECK-LABEL: define dso_local float @test_vcvts_n_f32_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTS_N_F32_S32:%.*]] = call float @llvm.aarch64.neon.vcvtfxs2fp.f32.i32(i32 [[A]], i32 1)
+// CHECK-NEXT: ret float [[VCVTS_N_F32_S32]]
+//
 float32_t test_vcvts_n_f32_s32(int32_t a) {
   return vcvts_n_f32_s32(a, 1);
 }

-// CHECK-LABEL: @test_vcvtd_n_f64_s64(
-// CHECK: [[VCVTD_N_F64_S64:%.*]] = call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 %a, i32 1)
-// CHECK: ret double [[VCVTD_N_F64_S64]]
+// CHECK-LABEL: define dso_local double @test_vcvtd_n_f64_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTD_N_F64_S64:%.*]] = call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 [[A]], i32 1)
+// CHECK-NEXT: ret double [[VCVTD_N_F64_S64]]
+//
 float64_t test_vcvtd_n_f64_s64(int64_t a) {
   return vcvtd_n_f64_s64(a, 1);
 }

-// CHECK-LABEL: @test_vcvts_n_f32_u32(
-// CHECK: [[VCVTS_N_F32_U32:%.*]] = call float @llvm.aarch64.neon.vcvtfxu2fp.f32.i32(i32 %a, i32 32)
-// CHECK: ret float [[VCVTS_N_F32_U32]]
+// CHECK-LABEL: define dso_local float @test_vcvts_n_f32_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTS_N_F32_U32:%.*]] = call float @llvm.aarch64.neon.vcvtfxu2fp.f32.i32(i32 [[A]], i32 32)
+// CHECK-NEXT: ret float [[VCVTS_N_F32_U32]]
+//
 float32_t test_vcvts_n_f32_u32(uint32_t a) {
   return vcvts_n_f32_u32(a, 32);
 }

-// CHECK-LABEL: @test_vcvtd_n_f64_u64(
-// CHECK: [[VCVTD_N_F64_U64:%.*]] = call double @llvm.aarch64.neon.vcvtfxu2fp.f64.i64(i64 %a, i32 64)
-// CHECK: ret double [[VCVTD_N_F64_U64]]
+// CHECK-LABEL: define dso_local double @test_vcvtd_n_f64_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTD_N_F64_U64:%.*]] = call double @llvm.aarch64.neon.vcvtfxu2fp.f64.i64(i64 [[A]], i32 64)
+// CHECK-NEXT: ret double [[VCVTD_N_F64_U64]]
+//
 float64_t test_vcvtd_n_f64_u64(uint64_t a) {
   return vcvtd_n_f64_u64(a, 64);
 }

-// CHECK-LABEL: @test_vcvts_n_s32_f32(
-// CHECK: [[VCVTS_N_S32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f32(float %a, i32 1)
-// CHECK: ret i32 [[VCVTS_N_S32_F32]]
+// CHECK-LABEL: define dso_local i32 @test_vcvts_n_s32_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTS_N_S32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f32(float [[A]], i32 1)
+// CHECK-NEXT: ret i32 [[VCVTS_N_S32_F32]]
+//
 int32_t test_vcvts_n_s32_f32(float32_t a) {
   return (int32_t)vcvts_n_s32_f32(a, 1);
 }

-// CHECK-LABEL: @test_vcvtd_n_s64_f64(
-// CHECK: [[VCVTD_N_S64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f64(double %a, i32 1)
-// CHECK: ret i64 [[VCVTD_N_S64_F64]]
+// CHECK-LABEL: define dso_local i64 @test_vcvtd_n_s64_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTD_N_S64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f64(double [[A]], i32 1)
+// CHECK-NEXT: ret i64 [[VCVTD_N_S64_F64]]
+//
 int64_t test_vcvtd_n_s64_f64(float64_t a) {
   return (int64_t)vcvtd_n_s64_f64(a, 1);
 }

-// CHECK-LABEL: @test_vcvts_n_u32_f32(
-// CHECK: [[VCVTS_N_U32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f32(float %a, i32 32)
-// CHECK: ret i32 [[VCVTS_N_U32_F32]]
+// CHECK-LABEL: define dso_local i32 @test_vcvts_n_u32_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTS_N_U32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f32(float [[A]], i32 32)
+// CHECK-NEXT: ret i32 [[VCVTS_N_U32_F32]]
+//
 uint32_t test_vcvts_n_u32_f32(float32_t a) {
   return (uint32_t)vcvts_n_u32_f32(a, 32);
 }

-// CHECK-LABEL: @test_vcvtd_n_u64_f64(
-// CHECK: [[VCVTD_N_U64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxu.i64.f64(double %a, i32 64)
-// CHECK: ret i64 [[VCVTD_N_U64_F64]]
+// CHECK-LABEL: define dso_local i64 @test_vcvtd_n_u64_f64(
+// CHECK-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTD_N_U64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxu.i64.f64(double [[A]], i32 64)
+// CHECK-NEXT: ret i64 [[VCVTD_N_U64_F64]]
+//
 uint64_t test_vcvtd_n_u64_f64(float64_t a) {
   return (uint64_t)vcvtd_n_u64_f64(a, 64);
 }

-// CHECK-LABEL: @test_vreinterpret_s8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_s16(int16x4_t a) {
   return vreinterpret_s8_s16(a);
 }

-// CHECK-LABEL: @test_vreinterpret_s8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_s32(int32x2_t a) {
   return vreinterpret_s8_s32(a);
 }

-// CHECK-LABEL: @test_vreinterpret_s8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_s64(int64x1_t a) {
   return vreinterpret_s8_s64(a);
 }

-// CHECK-LABEL: @test_vreinterpret_s8_u8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
 int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) {
   return vreinterpret_s8_u8(a);
 }

-// CHECK-LABEL: @test_vreinterpret_s8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) {
   return vreinterpret_s8_u16(a);
 }

-// CHECK-LABEL: @test_vreinterpret_s8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) {
   return vreinterpret_s8_u32(a);
 }

-// CHECK-LABEL: @test_vreinterpret_s8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) {
   return vreinterpret_s8_u64(a);
 }

-// CHECK-LABEL: @test_vreinterpret_s8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_f16(float16x4_t a) {
   return vreinterpret_s8_f16(a);
 }

-// CHECK-LABEL: @test_vreinterpret_s8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_f32(float32x2_t a) {
   return vreinterpret_s8_f32(a);
 }

-// CHECK-LABEL: @test_vreinterpret_s8_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_f64(float64x1_t a) {
   return vreinterpret_s8_f64(a);
 }

-// CHECK-LABEL: @test_vreinterpret_s8_p8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
 int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) {
   return vreinterpret_s8_p8(a);
 }

-// CHECK-LABEL: @test_vreinterpret_s8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) {
   return vreinterpret_s8_p16(a);
 }

-// CHECK-LABEL: @test_vreinterpret_s8_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_s8_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_p64(poly64x1_t a) {
   return vreinterpret_s8_p64(a);
 }

-// CHECK-LABEL: @test_vreinterpret_s16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+//
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// int16x4_t test_vreinterpret_s16_s8(int8x8_t a) { return vreinterpret_s16_s8(a); } -// CHECK-LABEL: @test_vreinterpret_s16_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// int16x4_t test_vreinterpret_s16_s32(int32x2_t a) { return vreinterpret_s16_s32(a); } -// CHECK-LABEL: @test_vreinterpret_s16_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// int16x4_t test_vreinterpret_s16_s64(int64x1_t a) { return vreinterpret_s16_s64(a); } -// CHECK-LABEL: @test_vreinterpret_s16_u8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) { return vreinterpret_s16_u8(a); } -// CHECK-LABEL: @test_vreinterpret_s16_u16( -// CHECK: ret <4 x i16> %a +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <4 x i16> [[A]] +// int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) { return vreinterpret_s16_u16(a); } -// CHECK-LABEL: @test_vreinterpret_s16_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) { return vreinterpret_s16_u32(a); } -// CHECK-LABEL: @test_vreinterpret_s16_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) { return vreinterpret_s16_u64(a); } -// CHECK-LABEL: @test_vreinterpret_s16_f16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// int16x4_t test_vreinterpret_s16_f16(float16x4_t a) { return vreinterpret_s16_f16(a); } -// 
CHECK-LABEL: @test_vreinterpret_s16_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// int16x4_t test_vreinterpret_s16_f32(float32x2_t a) { return vreinterpret_s16_f32(a); } -// CHECK-LABEL: @test_vreinterpret_s16_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// int16x4_t test_vreinterpret_s16_f64(float64x1_t a) { return vreinterpret_s16_f64(a); } -// CHECK-LABEL: @test_vreinterpret_s16_p8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) { return vreinterpret_s16_p8(a); } -// CHECK-LABEL: @test_vreinterpret_s16_p16( -// CHECK: ret <4 x i16> %a +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <4 x i16> [[A]] +// int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) { return vreinterpret_s16_p16(a); } -// CHECK-LABEL: @test_vreinterpret_s16_p64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_s16_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// int16x4_t test_vreinterpret_s16_p64(poly64x1_t a) { return vreinterpret_s16_p64(a); } -// CHECK-LABEL: @test_vreinterpret_s32_s8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// int32x2_t test_vreinterpret_s32_s8(int8x8_t a) { return vreinterpret_s32_s8(a); } -// CHECK-LABEL: @test_vreinterpret_s32_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// int32x2_t test_vreinterpret_s32_s16(int16x4_t a) { return vreinterpret_s32_s16(a); } -// CHECK-LABEL: @test_vreinterpret_s32_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_s64( +// 
CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// int32x2_t test_vreinterpret_s32_s64(int64x1_t a) { return vreinterpret_s32_s64(a); } -// CHECK-LABEL: @test_vreinterpret_s32_u8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) { return vreinterpret_s32_u8(a); } -// CHECK-LABEL: @test_vreinterpret_s32_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) { return vreinterpret_s32_u16(a); } -// CHECK-LABEL: @test_vreinterpret_s32_u32( -// CHECK: ret <2 x i32> %a +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <2 x i32> [[A]] +// int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) { return vreinterpret_s32_u32(a); } -// CHECK-LABEL: @test_vreinterpret_s32_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) { return vreinterpret_s32_u64(a); } -// CHECK-LABEL: @test_vreinterpret_s32_f16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// int32x2_t test_vreinterpret_s32_f16(float16x4_t a) { return vreinterpret_s32_f16(a); } -// CHECK-LABEL: @test_vreinterpret_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// int32x2_t test_vreinterpret_s32_f32(float32x2_t a) { return vreinterpret_s32_f32(a); } -// CHECK-LABEL: @test_vreinterpret_s32_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// int32x2_t 
test_vreinterpret_s32_f64(float64x1_t a) { return vreinterpret_s32_f64(a); } -// CHECK-LABEL: @test_vreinterpret_s32_p8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) { return vreinterpret_s32_p8(a); } -// CHECK-LABEL: @test_vreinterpret_s32_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) { return vreinterpret_s32_p16(a); } -// CHECK-LABEL: @test_vreinterpret_s32_p64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32> -// CHECK: ret <2 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_s32_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// int32x2_t test_vreinterpret_s32_p64(poly64x1_t a) { return vreinterpret_s32_p64(a); } -// CHECK-LABEL: @test_vreinterpret_s64_s8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// int64x1_t test_vreinterpret_s64_s8(int8x8_t a) { return vreinterpret_s64_s8(a); } -// CHECK-LABEL: @test_vreinterpret_s64_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// int64x1_t test_vreinterpret_s64_s16(int16x4_t a) { return vreinterpret_s64_s16(a); } -// CHECK-LABEL: @test_vreinterpret_s64_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// int64x1_t test_vreinterpret_s64_s32(int32x2_t a) { return vreinterpret_s64_s32(a); } -// CHECK-LABEL: @test_vreinterpret_s64_u8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) { return vreinterpret_s64_u8(a); } -// CHECK-LABEL: 
@test_vreinterpret_s64_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) { return vreinterpret_s64_u16(a); } -// CHECK-LABEL: @test_vreinterpret_s64_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) { return vreinterpret_s64_u32(a); } -// CHECK-LABEL: @test_vreinterpret_s64_u64( -// CHECK: ret <1 x i64> %a +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <1 x i64> [[A]] +// int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) { return vreinterpret_s64_u64(a); } -// CHECK-LABEL: @test_vreinterpret_s64_f16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// int64x1_t test_vreinterpret_s64_f16(float16x4_t a) { return vreinterpret_s64_f16(a); } -// CHECK-LABEL: @test_vreinterpret_s64_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// int64x1_t test_vreinterpret_s64_f32(float32x2_t a) { return vreinterpret_s64_f32(a); } -// CHECK-LABEL: @test_vreinterpret_s64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// int64x1_t test_vreinterpret_s64_f64(float64x1_t a) { return vreinterpret_s64_f64(a); } -// CHECK-LABEL: @test_vreinterpret_s64_p8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) { return vreinterpret_s64_p8(a); } -// CHECK-LABEL: @test_vreinterpret_s64_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_p16( +// 
CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) { return vreinterpret_s64_p16(a); } -// CHECK-LABEL: @test_vreinterpret_s64_p64( -// CHECK: ret <1 x i64> %a +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_s64_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <1 x i64> [[A]] +// int64x1_t test_vreinterpret_s64_p64(poly64x1_t a) { return vreinterpret_s64_p64(a); } -// CHECK-LABEL: @test_vreinterpret_u8_s8( -// CHECK: ret <8 x i8> %a +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i8> [[A]] +// uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) { return vreinterpret_u8_s8(a); } -// CHECK-LABEL: @test_vreinterpret_u8_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) { return vreinterpret_u8_s16(a); } -// CHECK-LABEL: @test_vreinterpret_u8_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) { return vreinterpret_u8_s32(a); } -// CHECK-LABEL: @test_vreinterpret_u8_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) { return vreinterpret_u8_s64(a); } -// CHECK-LABEL: @test_vreinterpret_u8_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) { return vreinterpret_u8_u16(a); } -// CHECK-LABEL: @test_vreinterpret_u8_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) { return vreinterpret_u8_u32(a); } -// CHECK-LABEL: @test_vreinterpret_u8_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] 
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) { return vreinterpret_u8_u64(a); }

-// CHECK-LABEL: @test_vreinterpret_u8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) { return vreinterpret_u8_f16(a); }

-// CHECK-LABEL: @test_vreinterpret_u8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) { return vreinterpret_u8_f32(a); }

-// CHECK-LABEL: @test_vreinterpret_u8_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_f64(float64x1_t a) { return vreinterpret_u8_f64(a); }

-// CHECK-LABEL: @test_vreinterpret_u8_p8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) { return vreinterpret_u8_p8(a); }

-// CHECK-LABEL: @test_vreinterpret_u8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) { return vreinterpret_u8_p16(a); }

-// CHECK-LABEL: @test_vreinterpret_u8_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_u8_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vreinterpret_u8_p64(poly64x1_t a) { return vreinterpret_u8_p64(a); }

-// CHECK-LABEL: @test_vreinterpret_u16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) { return vreinterpret_u16_s8(a); }

-// CHECK-LABEL: @test_vreinterpret_u16_s16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) { return vreinterpret_u16_s16(a); }

-// CHECK-LABEL: @test_vreinterpret_u16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) { return vreinterpret_u16_s32(a); }

-// CHECK-LABEL: @test_vreinterpret_u16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) { return vreinterpret_u16_s64(a); }

-// CHECK-LABEL: @test_vreinterpret_u16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) { return vreinterpret_u16_u8(a); }

-// CHECK-LABEL: @test_vreinterpret_u16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) { return vreinterpret_u16_u32(a); }

-// CHECK-LABEL: @test_vreinterpret_u16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) { return vreinterpret_u16_u64(a); }

-// CHECK-LABEL: @test_vreinterpret_u16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) { return vreinterpret_u16_f16(a); }

-// CHECK-LABEL: @test_vreinterpret_u16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) { return vreinterpret_u16_f32(a); }

-// CHECK-LABEL: @test_vreinterpret_u16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_f64(float64x1_t a) { return vreinterpret_u16_f64(a); }

-// CHECK-LABEL: @test_vreinterpret_u16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) { return vreinterpret_u16_p8(a); }

-// CHECK-LABEL: @test_vreinterpret_u16_p16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) { return vreinterpret_u16_p16(a); }

-// CHECK-LABEL: @test_vreinterpret_u16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_u16_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vreinterpret_u16_p64(poly64x1_t a) { return vreinterpret_u16_p64(a); }

-// CHECK-LABEL: @test_vreinterpret_u32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) { return vreinterpret_u32_s8(a); }

-// CHECK-LABEL: @test_vreinterpret_u32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) { return vreinterpret_u32_s16(a); }

-// CHECK-LABEL: @test_vreinterpret_u32_s32(
-// CHECK: ret <2 x i32> %a
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i32> [[A]]
+//
uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) { return vreinterpret_u32_s32(a); }

-// CHECK-LABEL: @test_vreinterpret_u32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) { return vreinterpret_u32_s64(a); }

-// CHECK-LABEL: @test_vreinterpret_u32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) { return vreinterpret_u32_u8(a); }

-// CHECK-LABEL: @test_vreinterpret_u32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) { return vreinterpret_u32_u16(a); }

-// CHECK-LABEL: @test_vreinterpret_u32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) { return vreinterpret_u32_u64(a); }

-// CHECK-LABEL: @test_vreinterpret_u32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) { return vreinterpret_u32_f16(a); }

-// CHECK-LABEL: @test_vreinterpret_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) { return vreinterpret_u32_f32(a); }

-// CHECK-LABEL: @test_vreinterpret_u32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_f64(float64x1_t a) { return vreinterpret_u32_f64(a); }

-// CHECK-LABEL: @test_vreinterpret_u32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) { return vreinterpret_u32_p8(a); }

-// CHECK-LABEL: @test_vreinterpret_u32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) { return vreinterpret_u32_p16(a); }

-// CHECK-LABEL: @test_vreinterpret_u32_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vreinterpret_u32_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vreinterpret_u32_p64(poly64x1_t a) { return vreinterpret_u32_p64(a); }

-// CHECK-LABEL: @test_vreinterpret_u64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) { return vreinterpret_u64_s8(a); }

-// CHECK-LABEL: @test_vreinterpret_u64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) { return vreinterpret_u64_s16(a); }

-// CHECK-LABEL: @test_vreinterpret_u64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) { return vreinterpret_u64_s32(a); }

-// CHECK-LABEL: @test_vreinterpret_u64_s64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) { return vreinterpret_u64_s64(a); }

-// CHECK-LABEL: @test_vreinterpret_u64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) { return vreinterpret_u64_u8(a); }

-// CHECK-LABEL: @test_vreinterpret_u64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) { return vreinterpret_u64_u16(a); }

-// CHECK-LABEL: @test_vreinterpret_u64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) { return vreinterpret_u64_u32(a); }

-// CHECK-LABEL: @test_vreinterpret_u64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) { return vreinterpret_u64_f16(a); }

-// CHECK-LABEL: @test_vreinterpret_u64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) { return vreinterpret_u64_f32(a); }

-// CHECK-LABEL: @test_vreinterpret_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_f64(float64x1_t a) { return vreinterpret_u64_f64(a); }

-// CHECK-LABEL: @test_vreinterpret_u64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) { return vreinterpret_u64_p8(a); }

-// CHECK-LABEL: @test_vreinterpret_u64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) { return vreinterpret_u64_p16(a); }

-// CHECK-LABEL: @test_vreinterpret_u64_p64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_u64_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
uint64x1_t test_vreinterpret_u64_p64(poly64x1_t a) { return vreinterpret_u64_p64(a); }

-// CHECK-LABEL: @test_vreinterpret_f16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_s8(int8x8_t a) { return vreinterpret_f16_s8(a); }

-// CHECK-LABEL: @test_vreinterpret_f16_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_s16(int16x4_t a) { return vreinterpret_f16_s16(a); }

-// CHECK-LABEL: @test_vreinterpret_f16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_s32(int32x2_t a) { return vreinterpret_f16_s32(a); }

-// CHECK-LABEL: @test_vreinterpret_f16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_s64(int64x1_t a) { return vreinterpret_f16_s64(a); }

-// CHECK-LABEL: @test_vreinterpret_f16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) { return vreinterpret_f16_u8(a); }

-// CHECK-LABEL: @test_vreinterpret_f16_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) { return vreinterpret_f16_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) { return vreinterpret_f16_u32(a); }

-// CHECK-LABEL: @test_vreinterpret_f16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) { return vreinterpret_f16_u64(a); }

-// CHECK-LABEL: @test_vreinterpret_f16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_f32(float32x2_t a) { return vreinterpret_f16_f32(a); }

-// CHECK-LABEL: @test_vreinterpret_f16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_f64(float64x1_t a) { return vreinterpret_f16_f64(a); }

-// CHECK-LABEL: @test_vreinterpret_f16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) { return vreinterpret_f16_p8(a); }

-// CHECK-LABEL: @test_vreinterpret_f16_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) { return vreinterpret_f16_p16(a); }

-// CHECK-LABEL: @test_vreinterpret_f16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vreinterpret_f16_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vreinterpret_f16_p64(poly64x1_t a) { return vreinterpret_f16_p64(a); }

-// CHECK-LABEL: @test_vreinterpret_f32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_s8(int8x8_t a) { return vreinterpret_f32_s8(a); }

-// CHECK-LABEL: @test_vreinterpret_f32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_s16(int16x4_t a) { return vreinterpret_f32_s16(a); }

-// CHECK-LABEL: @test_vreinterpret_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_s32(int32x2_t a) { return vreinterpret_f32_s32(a); }

-// CHECK-LABEL: @test_vreinterpret_f32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_s64(int64x1_t a) { return vreinterpret_f32_s64(a); }

-// CHECK-LABEL: @test_vreinterpret_f32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) { return vreinterpret_f32_u8(a); }

-// CHECK-LABEL: @test_vreinterpret_f32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) { return vreinterpret_f32_u16(a); }

-// CHECK-LABEL: @test_vreinterpret_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) { return vreinterpret_f32_u32(a); }

-// CHECK-LABEL: @test_vreinterpret_f32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) { return vreinterpret_f32_u64(a); }

-// CHECK-LABEL: @test_vreinterpret_f32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_f16(float16x4_t a) { return vreinterpret_f32_f16(a); }

-// CHECK-LABEL: @test_vreinterpret_f32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_f64(float64x1_t a) { return vreinterpret_f32_f64(a); }

-// CHECK-LABEL: @test_vreinterpret_f32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) { return vreinterpret_f32_p8(a); }

-// CHECK-LABEL: @test_vreinterpret_f32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) { return vreinterpret_f32_p16(a); }

-// CHECK-LABEL: @test_vreinterpret_f32_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vreinterpret_f32_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
float32x2_t test_vreinterpret_f32_p64(poly64x1_t a) { return vreinterpret_f32_p64(a); }

-// CHECK-LABEL: @test_vreinterpret_f64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_s8(int8x8_t a) { return vreinterpret_f64_s8(a); }

-// CHECK-LABEL: @test_vreinterpret_f64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_s16(int16x4_t a) { return vreinterpret_f64_s16(a); }

-// CHECK-LABEL: @test_vreinterpret_f64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_s32(int32x2_t a) { return vreinterpret_f64_s32(a); }

-// CHECK-LABEL: @test_vreinterpret_f64_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_s64(int64x1_t a) { return vreinterpret_f64_s64(a); }

-// CHECK-LABEL: @test_vreinterpret_f64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_u8(uint8x8_t a) { return vreinterpret_f64_u8(a); }

-// CHECK-LABEL: @test_vreinterpret_f64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_u16(uint16x4_t a) { return vreinterpret_f64_u16(a); }

-// CHECK-LABEL: @test_vreinterpret_f64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_u32(uint32x2_t a) { return vreinterpret_f64_u32(a); }

-// CHECK-LABEL: @test_vreinterpret_f64_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
-// CHECK: ret <1 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP0]]
+//
float64x1_t test_vreinterpret_f64_u64(uint64x1_t a) { return
vreinterpret_f64_u64(a); } -// CHECK-LABEL: @test_vreinterpret_f64_f16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP0]] +// float64x1_t test_vreinterpret_f64_f16(float16x4_t a) { return vreinterpret_f64_f16(a); } -// CHECK-LABEL: @test_vreinterpret_f64_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP0]] +// float64x1_t test_vreinterpret_f64_f32(float32x2_t a) { return vreinterpret_f64_f32(a); } -// CHECK-LABEL: @test_vreinterpret_f64_p8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP0]] +// float64x1_t test_vreinterpret_f64_p8(poly8x8_t a) { return vreinterpret_f64_p8(a); } -// CHECK-LABEL: @test_vreinterpret_f64_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP0]] +// float64x1_t test_vreinterpret_f64_p16(poly16x4_t a) { return vreinterpret_f64_p16(a); } -// CHECK-LABEL: @test_vreinterpret_f64_p64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double> -// CHECK: ret <1 x double> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x double> @test_vreinterpret_f64_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[TMP0]] +// float64x1_t test_vreinterpret_f64_p64(poly64x1_t a) { return vreinterpret_f64_p64(a); } -// CHECK-LABEL: @test_vreinterpret_p8_s8( -// CHECK: ret <8 x i8> %a +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i8> [[A]] +// poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) { return vreinterpret_p8_s8(a); } -// CHECK-LABEL: @test_vreinterpret_p8_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) { return vreinterpret_p8_s16(a); } -// CHECK-LABEL: @test_vreinterpret_p8_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: ret <8 x i8> 
[[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) { return vreinterpret_p8_s32(a); } -// CHECK-LABEL: @test_vreinterpret_p8_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) { return vreinterpret_p8_s64(a); } -// CHECK-LABEL: @test_vreinterpret_p8_u8( -// CHECK: ret <8 x i8> %a +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i8> [[A]] +// poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) { return vreinterpret_p8_u8(a); } -// CHECK-LABEL: @test_vreinterpret_p8_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) { return vreinterpret_p8_u16(a); } -// CHECK-LABEL: @test_vreinterpret_p8_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) { return vreinterpret_p8_u32(a); } -// CHECK-LABEL: @test_vreinterpret_p8_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) { return vreinterpret_p8_u64(a); } -// CHECK-LABEL: @test_vreinterpret_p8_f16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) { return vreinterpret_p8_f16(a); } -// CHECK-LABEL: @test_vreinterpret_p8_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// 
poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) { return vreinterpret_p8_f32(a); } -// CHECK-LABEL: @test_vreinterpret_p8_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_f64(float64x1_t a) { return vreinterpret_p8_f64(a); } -// CHECK-LABEL: @test_vreinterpret_p8_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) { return vreinterpret_p8_p16(a); } -// CHECK-LABEL: @test_vreinterpret_p8_p64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vreinterpret_p8_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// poly8x8_t test_vreinterpret_p8_p64(poly64x1_t a) { return vreinterpret_p8_p64(a); } -// CHECK-LABEL: @test_vreinterpret_p16_s8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) { return vreinterpret_p16_s8(a); } -// CHECK-LABEL: @test_vreinterpret_p16_s16( -// CHECK: ret <4 x i16> %a +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <4 x i16> [[A]] +// poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) { return vreinterpret_p16_s16(a); } -// CHECK-LABEL: @test_vreinterpret_p16_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) { return vreinterpret_p16_s32(a); } -// CHECK-LABEL: @test_vreinterpret_p16_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) { return vreinterpret_p16_s64(a); } -// CHECK-LABEL: @test_vreinterpret_p16_u8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: 
define dso_local <4 x i16> @test_vreinterpret_p16_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) { return vreinterpret_p16_u8(a); } -// CHECK-LABEL: @test_vreinterpret_p16_u16( -// CHECK: ret <4 x i16> %a +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <4 x i16> [[A]] +// poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) { return vreinterpret_p16_u16(a); } -// CHECK-LABEL: @test_vreinterpret_p16_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) { return vreinterpret_p16_u32(a); } -// CHECK-LABEL: @test_vreinterpret_p16_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) { return vreinterpret_p16_u64(a); } -// CHECK-LABEL: @test_vreinterpret_p16_f16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) { return vreinterpret_p16_f16(a); } -// CHECK-LABEL: @test_vreinterpret_p16_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) { return vreinterpret_p16_f32(a); } -// CHECK-LABEL: @test_vreinterpret_p16_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_f64(float64x1_t a) { return vreinterpret_p16_f64(a); } -// CHECK-LABEL: @test_vreinterpret_p16_p8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to 
<4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) { return vreinterpret_p16_p8(a); } -// CHECK-LABEL: @test_vreinterpret_p16_p64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16> -// CHECK: ret <4 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vreinterpret_p16_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// poly16x4_t test_vreinterpret_p16_p64(poly64x1_t a) { return vreinterpret_p16_p64(a); } -// CHECK-LABEL: @test_vreinterpret_p64_s8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// poly64x1_t test_vreinterpret_p64_s8(int8x8_t a) { return vreinterpret_p64_s8(a); } -// CHECK-LABEL: @test_vreinterpret_p64_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// poly64x1_t test_vreinterpret_p64_s16(int16x4_t a) { return vreinterpret_p64_s16(a); } -// CHECK-LABEL: @test_vreinterpret_p64_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// poly64x1_t test_vreinterpret_p64_s32(int32x2_t a) { return vreinterpret_p64_s32(a); } -// CHECK-LABEL: @test_vreinterpret_p64_s64( -// CHECK: ret <1 x i64> %a +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <1 x i64> [[A]] +// poly64x1_t test_vreinterpret_p64_s64(int64x1_t a) { return vreinterpret_p64_s64(a); } -// CHECK-LABEL: @test_vreinterpret_p64_u8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// poly64x1_t test_vreinterpret_p64_u8(uint8x8_t a) { return vreinterpret_p64_u8(a); } -// CHECK-LABEL: @test_vreinterpret_p64_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// poly64x1_t test_vreinterpret_p64_u16(uint16x4_t a) { return vreinterpret_p64_u16(a); } -// CHECK-LABEL: @test_vreinterpret_p64_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to 
<1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// poly64x1_t test_vreinterpret_p64_u32(uint32x2_t a) { return vreinterpret_p64_u32(a); } -// CHECK-LABEL: @test_vreinterpret_p64_u64( -// CHECK: ret <1 x i64> %a +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <1 x i64> [[A]] +// poly64x1_t test_vreinterpret_p64_u64(uint64x1_t a) { return vreinterpret_p64_u64(a); } -// CHECK-LABEL: @test_vreinterpret_p64_f16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// poly64x1_t test_vreinterpret_p64_f16(float16x4_t a) { return vreinterpret_p64_f16(a); } -// CHECK-LABEL: @test_vreinterpret_p64_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// poly64x1_t test_vreinterpret_p64_f32(float32x2_t a) { return vreinterpret_p64_f32(a); } -// CHECK-LABEL: @test_vreinterpret_p64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// poly64x1_t test_vreinterpret_p64_f64(float64x1_t a) { return vreinterpret_p64_f64(a); } -// CHECK-LABEL: @test_vreinterpret_p64_p8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// poly64x1_t test_vreinterpret_p64_p8(poly8x8_t a) { return vreinterpret_p64_p8(a); } -// CHECK-LABEL: @test_vreinterpret_p64_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64> -// CHECK: ret <1 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vreinterpret_p64_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// poly64x1_t test_vreinterpret_p64_p16(poly16x4_t a) { return vreinterpret_p64_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_s8_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) { return vreinterpretq_s8_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_s8_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) { return vreinterpretq_s8_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_s8_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) { return vreinterpretq_s8_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_s8_u8( -// CHECK: ret <16 x i8> %a +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[A]] +// int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) { return vreinterpretq_s8_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_s8_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) { return vreinterpretq_s8_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_s8_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) { return vreinterpretq_s8_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_s8_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) { return vreinterpretq_s8_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_s8_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) { return vreinterpretq_s8_f16(a); } -// 
CHECK-LABEL: @test_vreinterpretq_s8_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) { return vreinterpretq_s8_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_s8_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// int8x16_t test_vreinterpretq_s8_f64(float64x2_t a) { return vreinterpretq_s8_f64(a); } -// CHECK-LABEL: @test_vreinterpretq_s8_p8( -// CHECK: ret <16 x i8> %a +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[A]] +// int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) { return vreinterpretq_s8_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_s8_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) { return vreinterpretq_s8_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_s8_p64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_s8_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// int8x16_t test_vreinterpretq_s8_p64(poly64x2_t a) { return vreinterpretq_s8_p64(a); } -// CHECK-LABEL: @test_vreinterpretq_s16_s8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) { return vreinterpretq_s16_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_s16_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) { return vreinterpretq_s16_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_s16_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> 
@test_vreinterpretq_s16_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) { return vreinterpretq_s16_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_s16_u8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) { return vreinterpretq_s16_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_s16_u16( -// CHECK: ret <8 x i16> %a +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i16> [[A]] +// int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) { return vreinterpretq_s16_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_s16_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) { return vreinterpretq_s16_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_s16_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) { return vreinterpretq_s16_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_s16_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) { return vreinterpretq_s16_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_s16_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) { return vreinterpretq_s16_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_s16_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to 
<8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// int16x8_t test_vreinterpretq_s16_f64(float64x2_t a) { return vreinterpretq_s16_f64(a); } -// CHECK-LABEL: @test_vreinterpretq_s16_p8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) { return vreinterpretq_s16_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_s16_p16( -// CHECK: ret <8 x i16> %a +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i16> [[A]] +// int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) { return vreinterpretq_s16_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_s16_p64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_s16_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// int16x8_t test_vreinterpretq_s16_p64(poly64x2_t a) { return vreinterpretq_s16_p64(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_s8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) { return vreinterpretq_s32_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) { return vreinterpretq_s32_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) { return vreinterpretq_s32_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_u8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) { return vreinterpretq_s32_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_u16( -// CHECK: 
[[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) { return vreinterpretq_s32_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_u32( -// CHECK: ret <4 x i32> %a +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <4 x i32> [[A]] +// int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) { return vreinterpretq_s32_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) { return vreinterpretq_s32_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) { return vreinterpretq_s32_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) { return vreinterpretq_s32_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_f64(float64x2_t a) { return vreinterpretq_s32_f64(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_p8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) { return vreinterpretq_s32_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_p16( +// CHECK-SAME: <8 
x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) { return vreinterpretq_s32_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_s32_p64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32> -// CHECK: ret <4 x i32> [[TMP0]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_s32_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vreinterpretq_s32_p64(poly64x2_t a) { return vreinterpretq_s32_p64(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_s8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) { return vreinterpretq_s64_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) { return vreinterpretq_s64_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) { return vreinterpretq_s64_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_u8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) { return vreinterpretq_s64_u8(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) { return vreinterpretq_s64_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) { return vreinterpretq_s64_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_u64( -// CHECK: ret <2 x i64> %a +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <2 x i64> [[A]] +// int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) { return vreinterpretq_s64_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) { return vreinterpretq_s64_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) { return vreinterpretq_s64_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_f64(float64x2_t a) { return vreinterpretq_s64_f64(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_p8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) { return vreinterpretq_s64_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) { return vreinterpretq_s64_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_p64( -// CHECK: ret <2 x i64> %a +// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_s64_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <2 x i64> [[A]] +// int64x2_t test_vreinterpretq_s64_p64(poly64x2_t a) { return vreinterpretq_s64_p64(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_s8( -// CHECK: ret <16 x i8> %a +// CHECK-LABEL: define 
dso_local <16 x i8> @test_vreinterpretq_u8_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
 uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) {
   return vreinterpretq_u8_s8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) {
   return vreinterpretq_u8_s16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) {
   return vreinterpretq_u8_s32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) {
   return vreinterpretq_u8_s64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) {
   return vreinterpretq_u8_u16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) {
   return vreinterpretq_u8_u32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) {
   return vreinterpretq_u8_u64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) {
   return vreinterpretq_u8_f16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) {
   return vreinterpretq_u8_f32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u8_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_f64(float64x2_t a) {
   return vreinterpretq_u8_f64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u8_p8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
 uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) {
   return vreinterpretq_u8_p8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) {
   return vreinterpretq_u8_p16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u8_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_u8_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_p64(poly64x2_t a) {
   return vreinterpretq_u8_p64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) {
   return vreinterpretq_u16_s8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u16_s16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
 uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) {
   return vreinterpretq_u16_s16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) {
   return vreinterpretq_u16_s32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) {
   return vreinterpretq_u16_s64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) {
   return vreinterpretq_u16_u8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) {
   return vreinterpretq_u16_u32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) {
   return vreinterpretq_u16_u64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) {
   return vreinterpretq_u16_f16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) {
   return vreinterpretq_u16_f32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_f64(float64x2_t a) {
   return vreinterpretq_u16_f64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) {
   return vreinterpretq_u16_p8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u16_p16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
 uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) {
   return vreinterpretq_u16_p16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_u16_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_p64(poly64x2_t a) {
   return vreinterpretq_u16_p64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) {
   return vreinterpretq_u32_s8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) {
   return vreinterpretq_u32_s16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u32_s32(
-// CHECK: ret <4 x i32> %a
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i32> [[A]]
+//
 uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) {
   return vreinterpretq_u32_s32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) {
   return vreinterpretq_u32_s64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) {
   return vreinterpretq_u32_u8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) {
   return vreinterpretq_u32_u16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) {
   return vreinterpretq_u32_u64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) {
   return vreinterpretq_u32_f16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) {
   return vreinterpretq_u32_f32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_f64(float64x2_t a) {
   return vreinterpretq_u32_f64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) {
   return vreinterpretq_u32_p8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) {
   return vreinterpretq_u32_p16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u32_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vreinterpretq_u32_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_p64(poly64x2_t a) {
   return vreinterpretq_u32_p64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) {
   return vreinterpretq_u64_s8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) {
   return vreinterpretq_u64_s16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) {
   return vreinterpretq_u64_s32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u64_s64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
 uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) {
   return vreinterpretq_u64_s64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) {
   return vreinterpretq_u64_u8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) {
   return vreinterpretq_u64_u16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) {
   return vreinterpretq_u64_u32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) {
   return vreinterpretq_u64_f16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) {
   return vreinterpretq_u64_f32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_f64(float64x2_t a) {
   return vreinterpretq_u64_f64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) {
   return vreinterpretq_u64_p8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) {
   return vreinterpretq_u64_p16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_u64_p64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_u64_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
 uint64x2_t test_vreinterpretq_u64_p64(poly64x2_t a) {
   return vreinterpretq_u64_p64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) {
   return vreinterpretq_f16_s8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f16_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) {
   return vreinterpretq_f16_s16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) {
   return vreinterpretq_f16_s32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) {
   return vreinterpretq_f16_s64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) {
   return vreinterpretq_f16_u8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f16_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) {
   return vreinterpretq_f16_u16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) {
   return vreinterpretq_f16_u32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) {
   return vreinterpretq_f16_u64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) {
   return vreinterpretq_f16_f32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_f64(float64x2_t a) {
   return vreinterpretq_f16_f64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) {
   return vreinterpretq_f16_p8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f16_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) {
   return vreinterpretq_f16_p16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vreinterpretq_f16_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_p64(poly64x2_t a) {
   return vreinterpretq_f16_p64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) {
   return vreinterpretq_f32_s8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) {
   return vreinterpretq_f32_s16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) {
   return vreinterpretq_f32_s32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) {
   return vreinterpretq_f32_s64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) {
   return vreinterpretq_f32_u8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) {
   return vreinterpretq_f32_u16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) {
   return vreinterpretq_f32_u32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) {
   return vreinterpretq_f32_u64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) {
   return vreinterpretq_f32_f16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_f64(float64x2_t a) {
   return vreinterpretq_f32_f64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) {
   return vreinterpretq_f32_p8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) {
   return vreinterpretq_f32_p16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f32_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vreinterpretq_f32_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_p64(poly64x2_t a) {
   return vreinterpretq_f32_p64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vreinterpretq_f64_s8(int8x16_t a) {
   return vreinterpretq_f64_s8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vreinterpretq_f64_s16(int16x8_t a) {
   return vreinterpretq_f64_s16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vreinterpretq_f64_s32(int32x4_t a) {
   return vreinterpretq_f64_s32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f64_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vreinterpretq_f64_s64(int64x2_t a) {
   return vreinterpretq_f64_s64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vreinterpretq_f64_u8(uint8x16_t a) {
   return vreinterpretq_f64_u8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vreinterpretq_f64_u16(uint16x8_t a) {
   return vreinterpretq_f64_u16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vreinterpretq_f64_u32(uint32x4_t a) {
   return vreinterpretq_f64_u32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f64_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vreinterpretq_f64_u64(uint64x2_t a) {
   return vreinterpretq_f64_u64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vreinterpretq_f64_f16(float16x8_t a) {
   return vreinterpretq_f64_f16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vreinterpretq_f64_f32(float32x4_t a) {
   return vreinterpretq_f64_f32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vreinterpretq_f64_p8(poly8x16_t a) {
   return vreinterpretq_f64_p8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vreinterpretq_f64_p16(poly16x8_t a) {
   return vreinterpretq_f64_p16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_f64_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double>
-// CHECK: ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vreinterpretq_f64_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vreinterpretq_f64_p64(poly64x2_t a) {
   return vreinterpretq_f64_p64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p8_s8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
 poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) {
   return vreinterpretq_p8_s8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) {
   return vreinterpretq_p8_s16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) {
   return vreinterpretq_p8_s32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) {
   return vreinterpretq_p8_s64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p8_u8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
 poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) {
   return vreinterpretq_p8_u8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) {
   return vreinterpretq_p8_u16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) {
   return vreinterpretq_p8_u32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) {
   return vreinterpretq_p8_u64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) {
   return vreinterpretq_p8_f16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) {
   return vreinterpretq_p8_f32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p8_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_f64(float64x2_t a) {
   return vreinterpretq_p8_f64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) {
   return vreinterpretq_p8_p16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p8_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vreinterpretq_p8_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_p64(poly64x2_t a) {
   return vreinterpretq_p8_p64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) {
   return vreinterpretq_p16_s8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p16_s16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
 poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) {
   return vreinterpretq_p16_s16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) {
   return vreinterpretq_p16_s32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) {
   return vreinterpretq_p16_s64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) {
   return vreinterpretq_p16_u8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p16_u16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
 poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) {
   return vreinterpretq_p16_u16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) {
   return vreinterpretq_p16_u32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) {
   return vreinterpretq_p16_u64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) {
   return vreinterpretq_p16_f16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) {
   return vreinterpretq_p16_f32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p16_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_f64(float64x2_t a) {
   return vreinterpretq_p16_f64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
   return vreinterpretq_p16_p8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p16_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vreinterpretq_p16_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_p64(poly64x2_t a) {
   return vreinterpretq_p16_p64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 poly64x2_t test_vreinterpretq_p64_s8(int8x16_t a) {
   return vreinterpretq_p64_s8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 poly64x2_t test_vreinterpretq_p64_s16(int16x8_t a) {
   return vreinterpretq_p64_s16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 poly64x2_t test_vreinterpretq_p64_s32(int32x4_t a) {
   return vreinterpretq_p64_s32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p64_s64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
 poly64x2_t test_vreinterpretq_p64_s64(int64x2_t a) {
   return vreinterpretq_p64_s64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 poly64x2_t test_vreinterpretq_p64_u8(uint8x16_t a) {
   return vreinterpretq_p64_u8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 poly64x2_t test_vreinterpretq_p64_u16(uint16x8_t a) {
   return vreinterpretq_p64_u16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 poly64x2_t test_vreinterpretq_p64_u32(uint32x4_t a) {
   return vreinterpretq_p64_u32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p64_u64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
 poly64x2_t test_vreinterpretq_p64_u64(uint64x2_t a) {
   return vreinterpretq_p64_u64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 poly64x2_t test_vreinterpretq_p64_f16(float16x8_t a) {
   return vreinterpretq_p64_f16(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 poly64x2_t test_vreinterpretq_p64_f32(float32x4_t a) {
   return vreinterpretq_p64_f32(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 poly64x2_t test_vreinterpretq_p64_f64(float64x2_t a) {
   return vreinterpretq_p64_f64(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 poly64x2_t test_vreinterpretq_p64_p8(poly8x16_t a) {
   return vreinterpretq_p64_p8(a);
 }

-// CHECK-LABEL: @test_vreinterpretq_p64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vreinterpretq_p64_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 poly64x2_t test_vreinterpretq_p64_p16(poly16x8_t a) {
   return vreinterpretq_p64_p16(a);
 }

-// CHECK-LABEL: @test_vabds_f32(
-// CHECK: [[VABDS_F32_I:%.*]] = call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b)
-// CHECK: ret float [[VABDS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vabds_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABDS_F32_I:%.*]] = call float @llvm.aarch64.sisd.fabd.f32(float [[A]], float [[B]])
+// CHECK-NEXT: ret float [[VABDS_F32_I]]
+//
 float32_t test_vabds_f32(float32_t a, float32_t b) {
   return vabds_f32(a, b);
 }

-// CHECK-LABEL: @test_vabdd_f64(
-// CHECK: [[VABDD_F64_I:%.*]] = call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b)
-// CHECK: ret double [[VABDD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vabdd_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABDD_F64_I:%.*]] = call double @llvm.aarch64.sisd.fabd.f64(double [[A]], double [[B]])
+// CHECK-NEXT: ret double [[VABDD_F64_I]]
+//
 float64_t test_vabdd_f64(float64_t a, float64_t b) {
   return vabdd_f64(a, b);
 }

-// CHECK-LABEL: @test_vuqaddq_s8(
-// CHECK: entry:
-// CHECK-NEXT: [[V:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK-NEXT: ret <16 x i8> [[V]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vuqaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VUQADD_I]]
+//
 int8x16_t test_vuqaddq_s8(int8x16_t a, uint8x16_t b) {
   return vuqaddq_s8(a, b);
 }

-// CHECK-LABEL: @test_vuqaddq_s32(
-// CHECK: [[V:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK-NEXT: ret <4 x i32> [[V]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vuqaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VUQADD2_I]]
+//
 int32x4_t test_vuqaddq_s32(int32x4_t a, uint32x4_t b) {
   return vuqaddq_s32(a, b);
 }

-// CHECK-LABEL: @test_vuqaddq_s64(
-// CHECK: [[V:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK-NEXT: ret <2 x i64> [[V]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vuqaddq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VUQADD2_I]]
+//
 int64x2_t test_vuqaddq_s64(int64x2_t a, uint64x2_t b) {
   return vuqaddq_s64(a, b);
 }

-// CHECK-LABEL: @test_vuqaddq_s16(
-// CHECK: [[V:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK-NEXT: ret <8 x i16> [[V]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vuqaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VUQADD2_I]]
+//
 int16x8_t test_vuqaddq_s16(int16x8_t a, uint16x8_t b) {
   return vuqaddq_s16(a, b);
 }

-// CHECK-LABEL: @test_vuqadd_s8(
-// CHECK: entry:
-// CHECK-NEXT: [[V:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK-NEXT: ret <8 x i8> [[V]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vuqadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VUQADD_I]]
+//
 int8x8_t test_vuqadd_s8(int8x8_t a, uint8x8_t b) {
   return vuqadd_s8(a, b);
 }

-// CHECK-LABEL: @test_vuqadd_s32(
-// CHECK: [[V:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK-NEXT: ret <2 x i32> [[V]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vuqadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VUQADD2_I]]
+//
 int32x2_t test_vuqadd_s32(int32x2_t a, uint32x2_t b) {
   return vuqadd_s32(a, b);
 }

-// CHECK-LABEL: @test_vuqadd_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VUQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: ret <1 x i64> [[VUQADD2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vuqadd_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VUQADD2_I]]
+//
 int64x1_t test_vuqadd_s64(int64x1_t a, uint64x1_t b) {
   return vuqadd_s64(a, b);
 }

-// CHECK-LABEL: @test_vuqadd_s16(
-// CHECK: [[V:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK-NEXT: ret <4 x i16> [[V]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vuqadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VUQADD2_I]]
+//
 int16x4_t test_vuqadd_s16(int16x4_t a, uint16x4_t b) {
   return vuqadd_s16(a, b);
 }

-// CHECK-LABEL: @test_vsqadd_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: ret <1 x i64> [[VSQADD2_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vsqadd_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VSQADD2_I]]
+//
 uint64x1_t test_vsqadd_u64(uint64x1_t a, int64x1_t b) {
   return vsqadd_u64(a, b);
 }

-// CHECK-LABEL: @test_vsqadd_u8(
-// CHECK: [[VSQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VSQADD_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vsqadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VSQADD_I]]
+//
 uint8x8_t test_vsqadd_u8(uint8x8_t a, int8x8_t b) {
   return vsqadd_u8(a, b);
 }

-// CHECK-LABEL: @test_vsqaddq_u8(
-// CHECK: [[VSQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VSQADD_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vsqaddq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VSQADD_I]] +// uint8x16_t test_vsqaddq_u8(uint8x16_t a, int8x16_t b) { return vsqaddq_u8(a, b); } -// CHECK-LABEL: @test_vsqadd_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VSQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: ret <4 x i16> [[VSQADD2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vsqadd_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VSQADD2_I]] +// uint16x4_t test_vsqadd_u16(uint16x4_t a, int16x4_t b) { return vsqadd_u16(a, b); } -// CHECK-LABEL: @test_vsqaddq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VSQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i16> [[VSQADD2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vsqaddq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VSQADD2_I]] +// uint16x8_t test_vsqaddq_u16(uint16x8_t a, int16x8_t b) { return vsqaddq_u16(a, b); } -// CHECK-LABEL: @test_vsqadd_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VSQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: ret <2 x i32> [[VSQADD2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsqadd_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VSQADD2_I]] +// uint32x2_t test_vsqadd_u32(uint32x2_t a, int32x2_t b) { return vsqadd_u32(a, b); } -// CHECK-LABEL: @test_vsqaddq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VSQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: ret <4 x i32> [[VSQADD2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsqaddq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VSQADD2_I]] +// uint32x4_t test_vsqaddq_u32(uint32x4_t a, int32x4_t b) { return vsqaddq_u32(a, b); } -// CHECK-LABEL: @test_vsqaddq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x 
i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: ret <2 x i64> [[VSQADD2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vsqaddq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i64> [[VSQADD2_I]] +// uint64x2_t test_vsqaddq_u64(uint64x2_t a, int64x2_t b) { return vsqaddq_u64(a, b); } -// CHECK-LABEL: @test_vabs_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[VABS1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %a) -// CHECK: ret <1 x i64> [[VABS1_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vabs_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABS1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> [[A]]) +// CHECK-NEXT: ret <1 x i64> [[VABS1_I]] +// int64x1_t test_vabs_s64(int64x1_t a) { return vabs_s64(a); } -// CHECK-LABEL: @test_vqabs_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[VQABS_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqabs.v1i64(<1 x i64> %a) -// CHECK: [[VQABS_V2_I:%.*]] = bitcast <1 x i64> [[VQABS_V1_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQABS_V1_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vqabs_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqabs.v1i64(<1 x i64> [[A]]) +// CHECK-NEXT: ret <1 x i64> [[VQABS_V1_I]] +// int64x1_t test_vqabs_s64(int64x1_t a) { return vqabs_s64(a); } -// CHECK-LABEL: @test_vqneg_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[VQNEG_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqneg.v1i64(<1 x i64> %a) -// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <1 x i64> [[VQNEG_V1_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQNEG_V1_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vqneg_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqneg.v1i64(<1 x i64> [[A]]) +// CHECK-NEXT: ret <1 x i64> [[VQNEG_V1_I]] +// int64x1_t test_vqneg_s64(int64x1_t a) { return vqneg_s64(a); } -// CHECK-LABEL: @test_vneg_s64( -// CHECK: [[SUB_I:%.*]] = sub <1 x i64> zeroinitializer, %a -// CHECK: ret <1 x i64> [[SUB_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vneg_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <1 x i64> zeroinitializer, [[A]] +// CHECK-NEXT: ret <1 x i64> [[SUB_I]] +// int64x1_t test_vneg_s64(int64x1_t a) { return vneg_s64(a); } -// CHECK-LABEL: @test_vaddv_f32( -// CHECK: [[VADDV_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a) -// CHECK: ret float [[VADDV_F32_I]] +// CHECK-LABEL: define dso_local float @test_vaddv_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VADDV_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret float [[VADDV_F32_I]] +// float32_t test_vaddv_f32(float32x2_t a) { return vaddv_f32(a); } -// CHECK-LABEL: 
@test_vaddvq_f32( -// CHECK: [[VADDVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a) -// CHECK: ret float [[VADDVQ_F32_I]] +// CHECK-LABEL: define dso_local float @test_vaddvq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VADDVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: ret float [[VADDVQ_F32_I]] +// float32_t test_vaddvq_f32(float32x4_t a) { return vaddvq_f32(a); } -// CHECK-LABEL: @test_vaddvq_f64( -// CHECK: [[VADDVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a) -// CHECK: ret double [[VADDVQ_F64_I]] +// CHECK-LABEL: define dso_local double @test_vaddvq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VADDVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret double [[VADDVQ_F64_I]] +// float64_t test_vaddvq_f64(float64x2_t a) { return vaddvq_f64(a); } -// CHECK-LABEL: @test_vmaxv_f32( -// CHECK: [[VMAXV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a) -// CHECK: ret float [[VMAXV_F32_I]] +// CHECK-LABEL: define dso_local float @test_vmaxv_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret float [[VMAXV_F32_I]] +// float32_t test_vmaxv_f32(float32x2_t a) { return vmaxv_f32(a); } -// CHECK-LABEL: @test_vmaxvq_f64( -// CHECK: [[VMAXVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> %a) -// CHECK: ret double [[VMAXVQ_F64_I]] +// CHECK-LABEL: define dso_local double @test_vmaxvq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret double [[VMAXVQ_F64_I]] +// float64_t test_vmaxvq_f64(float64x2_t a) { return vmaxvq_f64(a); } -// CHECK-LABEL: @test_vminv_f32( -// CHECK: [[VMINV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> %a) -// CHECK: ret float [[VMINV_F32_I]] +// CHECK-LABEL: define dso_local float @test_vminv_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret float [[VMINV_F32_I]] +// float32_t test_vminv_f32(float32x2_t a) { return vminv_f32(a); } -// CHECK-LABEL: @test_vminvq_f64( -// CHECK: [[VMINVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> %a) -// CHECK: ret double [[VMINVQ_F64_I]] +// CHECK-LABEL: define dso_local double @test_vminvq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret double [[VMINVQ_F64_I]] +// float64_t test_vminvq_f64(float64x2_t a) { return vminvq_f64(a); } -// CHECK-LABEL: @test_vmaxnmvq_f64( -// CHECK: [[VMAXNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %a) -// CHECK: ret double [[VMAXNMVQ_F64_I]] +// CHECK-LABEL: define dso_local double @test_vmaxnmvq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[VMAXNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret double [[VMAXNMVQ_F64_I]] +// float64_t test_vmaxnmvq_f64(float64x2_t a) { return vmaxnmvq_f64(a); } -// CHECK-LABEL: @test_vmaxnmv_f32( -// CHECK: [[VMAXNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> %a) -// CHECK: ret float [[VMAXNMV_F32_I]] +// CHECK-LABEL: define dso_local float @test_vmaxnmv_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret float [[VMAXNMV_F32_I]] +// float32_t test_vmaxnmv_f32(float32x2_t a) { return vmaxnmv_f32(a); } -// CHECK-LABEL: @test_vminnmvq_f64( -// CHECK: [[VMINNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %a) -// CHECK: ret double [[VMINNMVQ_F64_I]] +// CHECK-LABEL: define dso_local double @test_vminnmvq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret double [[VMINNMVQ_F64_I]] +// float64_t test_vminnmvq_f64(float64x2_t a) { return vminnmvq_f64(a); } -// CHECK-LABEL: @test_vminnmv_f32( -// CHECK: [[VMINNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> %a) -// CHECK: ret float [[VMINNMV_F32_I]] +// CHECK-LABEL: define dso_local float @test_vminnmv_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret float [[VMINNMV_F32_I]] +// float32_t test_vminnmv_f32(float32x2_t a) { return vminnmv_f32(a); } -// CHECK-LABEL: @test_vpaddq_s64( -// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VPADDQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vpaddq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i64> [[VPADDQ_V2_I]] +// int64x2_t test_vpaddq_s64(int64x2_t a, int64x2_t b) { return vpaddq_s64(a, b); } -// CHECK-LABEL: @test_vpaddq_u64( -// CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VPADDQ_V2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vpaddq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i64> [[VPADDQ_V2_I]] +// uint64x2_t test_vpaddq_u64(uint64x2_t a, uint64x2_t b) { return vpaddq_u64(a, b); } -// CHECK-LABEL: @test_vpaddd_u64( -// CHECK: [[VPADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a) -// CHECK: ret i64 [[VPADDD_U64_I]] +// CHECK-LABEL: define dso_local i64 @test_vpaddd_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A]]) +// CHECK-NEXT: ret i64 [[VPADDD_U64_I]] +// uint64_t test_vpaddd_u64(uint64x2_t a) { return vpaddd_u64(a); } -// CHECK-LABEL: @test_vaddvq_s64( -// CHECK: [[VADDVQ_S64_I:%.*]] = call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a) -// CHECK: ret i64 [[VADDVQ_S64_I]] +// CHECK-LABEL: define dso_local i64 @test_vaddvq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VADDVQ_S64_I:%.*]] = call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> [[A]]) +// CHECK-NEXT: ret i64 [[VADDVQ_S64_I]] +// int64_t test_vaddvq_s64(int64x2_t a) { return vaddvq_s64(a); } -// CHECK-LABEL: @test_vaddvq_u64( -// CHECK: [[VADDVQ_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a) -// CHECK: ret i64 [[VADDVQ_U64_I]] +// CHECK-LABEL: define dso_local i64 @test_vaddvq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VADDVQ_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[A]]) +// CHECK-NEXT: ret i64 [[VADDVQ_U64_I]] +// uint64_t test_vaddvq_u64(uint64x2_t a) { return vaddvq_u64(a); } -// CHECK-LABEL: @test_vadd_f64( -// CHECK: [[ADD_I:%.*]] = fadd <1 x double> %a, %b -// CHECK: ret <1 x double> [[ADD_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vadd_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = fadd <1 x double> [[A]], [[B]] +// CHECK-NEXT: ret <1 x double> [[ADD_I]] +// float64x1_t test_vadd_f64(float64x1_t a, float64x1_t b) { return vadd_f64(a, b); } -// CHECK-LABEL: @test_vmul_f64( -// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %a, %b -// CHECK: ret <1 x double> [[MUL_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vmul_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[A]], [[B]] +// CHECK-NEXT: ret <1 x double> [[MUL_I]] +// float64x1_t test_vmul_f64(float64x1_t a, float64x1_t b) { return vmul_f64(a, b); } -// CHECK-LABEL: @test_vdiv_f64( -// CHECK: [[DIV_I:%.*]] = fdiv <1 x double> %a, %b -// CHECK: ret <1 x double> [[DIV_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vdiv_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DIV_I:%.*]] = fdiv <1 x double> [[A]], [[B]] +// CHECK-NEXT: ret <1 x double> [[DIV_I]] +// float64x1_t test_vdiv_f64(float64x1_t a, float64x1_t b) { return vdiv_f64(a, b); } -// CHECK-LABEL: @test_vmla_f64( -// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %b, %c -// CHECK: [[ADD_I:%.*]] = fadd <1 x double> %a, [[MUL_I]] -// CHECK: ret <1 x double> [[ADD_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vmla_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = fadd <1 x double> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <1 x double> [[ADD_I]] +// float64x1_t test_vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) { return vmla_f64(a, b, c); } -// CHECK-LABEL: @test_vmls_f64( -// CHECK: [[MUL_I:%.*]] = fmul <1 x double> %b, %c -// CHECK: [[SUB_I:%.*]] = 
fsub <1 x double> %a, [[MUL_I]] -// CHECK: ret <1 x double> [[SUB_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vmls_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <1 x double> [[B]], [[C]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub <1 x double> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <1 x double> [[SUB_I]] +// float64x1_t test_vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) { return vmls_f64(a, b, c); } -// CHECK-LABEL: @test_vfma_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8> -// CHECK: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> %c, <1 x double> %a) -// CHECK: ret <1 x double> [[TMP3]] +// CHECK-LABEL: define dso_local <1 x double> @test_vfma_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[C]], <1 x double> [[A]]) +// CHECK-NEXT: ret <1 x double> [[TMP0]] +// float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) { return vfma_f64(a, b, c); } -// CHECK-LABEL: @test_vfms_f64( -// CHECK: [[SUB_I:%.*]] = fneg <1 x double> %b -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8> -// CHECK: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[SUB_I]], <1 x double> %c, <1 x double> %a) -// CHECK: ret <1 x double> [[TMP3]] +// CHECK-LABEL: define dso_local <1 x double> @test_vfms_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[B]] +// CHECK-NEXT: [[TMP0:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[C]], <1 x double> [[A]]) +// CHECK-NEXT: ret <1 x double> [[TMP0]] +// float64x1_t test_vfms_f64(float64x1_t a, float64x1_t b, float64x1_t c) { return vfms_f64(a, b, c); } -// CHECK-LABEL: @test_vsub_f64( -// CHECK: [[SUB_I:%.*]] = fsub <1 x double> %a, %b -// CHECK: ret <1 x double> [[SUB_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vsub_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub <1 x double> [[A]], [[B]] +// CHECK-NEXT: ret <1 x double> [[SUB_I]] +// float64x1_t test_vsub_f64(float64x1_t a, float64x1_t b) { return vsub_f64(a, b); } -// CHECK-LABEL: @test_vabd_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VABD2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> %a, <1 x double> %b) -// CHECK: ret <1 x double> [[VABD2_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vabd_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> [[A]], <1 x double> [[B]]) +// CHECK-NEXT: ret <1 x double> 
[[VABD2_I]] +// float64x1_t test_vabd_f64(float64x1_t a, float64x1_t b) { return vabd_f64(a, b); } -// CHECK-LABEL: @test_vmax_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VMAX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> %a, <1 x double> %b) -// CHECK: ret <1 x double> [[VMAX2_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vmax_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> [[A]], <1 x double> [[B]]) +// CHECK-NEXT: ret <1 x double> [[VMAX2_I]] +// float64x1_t test_vmax_f64(float64x1_t a, float64x1_t b) { return vmax_f64(a, b); } -// CHECK-LABEL: @test_vmin_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VMIN2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> %a, <1 x double> %b) -// CHECK: ret <1 x double> [[VMIN2_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vmin_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> [[A]], <1 x double> [[B]]) +// CHECK-NEXT: ret <1 x double> [[VMIN2_I]] +// float64x1_t test_vmin_f64(float64x1_t a, float64x1_t b) { return vmin_f64(a, b); } -// CHECK-LABEL: @test_vmaxnm_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VMAXNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> %a, <1 x double> %b) -// CHECK: ret <1 x double> [[VMAXNM2_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vmaxnm_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> [[A]], <1 x double> [[B]]) +// CHECK-NEXT: ret <1 x double> [[VMAXNM2_I]] +// float64x1_t test_vmaxnm_f64(float64x1_t a, float64x1_t b) { return vmaxnm_f64(a, b); } -// CHECK-LABEL: @test_vminnm_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VMINNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> %a, <1 x double> %b) -// CHECK: ret <1 x double> [[VMINNM2_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vminnm_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> [[A]], <1 x double> [[B]]) +// CHECK-NEXT: ret <1 x double> [[VMINNM2_I]] +// float64x1_t test_vminnm_f64(float64x1_t a, float64x1_t b) { return vminnm_f64(a, b); } -// CHECK-LABEL: @test_vabs_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VABS1_I:%.*]] = call <1 x double> @llvm.fabs.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[VABS1_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vabs_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABS1_I:%.*]] = call <1 x double> 
@llvm.fabs.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x double> [[VABS1_I]] +// float64x1_t test_vabs_f64(float64x1_t a) { return vabs_f64(a); } -// CHECK-LABEL: @test_vneg_f64( -// CHECK: [[SUB_I:%.*]] = fneg <1 x double> %a -// CHECK: ret <1 x double> [[SUB_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vneg_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <1 x double> [[A]] +// CHECK-NEXT: ret <1 x double> [[FNEG_I]] +// float64x1_t test_vneg_f64(float64x1_t a) { return vneg_f64(a); } -// CHECK-LABEL: @test_vcvt_s64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %a) -// CHECK: ret <1 x i64> [[TMP1]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcvt_s64_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x i64> [[VCVTZ1_I]] +// int64x1_t test_vcvt_s64_f64(float64x1_t a) { return vcvt_s64_f64(a); } -// CHECK-LABEL: @test_vcvt_u64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %a) -// CHECK: ret <1 x i64> [[TMP1]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcvt_u64_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x i64> [[VCVTZ1_I]] +// uint64x1_t test_vcvt_u64_f64(float64x1_t a) { return vcvt_u64_f64(a); } -// CHECK-LABEL: @test_vcvtn_s64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> %a) -// CHECK: ret <1 x i64> [[VCVTN1_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtn_s64_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x i64> [[VCVTN1_I]] +// int64x1_t test_vcvtn_s64_f64(float64x1_t a) { return vcvtn_s64_f64(a); } -// CHECK-LABEL: @test_vcvtn_u64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> %a) -// CHECK: ret <1 x i64> [[VCVTN1_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtn_u64_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x i64> [[VCVTN1_I]] +// uint64x1_t test_vcvtn_u64_f64(float64x1_t a) { return vcvtn_u64_f64(a); } -// CHECK-LABEL: @test_vcvtp_s64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> %a) -// CHECK: ret <1 x i64> [[VCVTP1_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtp_s64_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> 
[[A]]) +// CHECK-NEXT: ret <1 x i64> [[VCVTP1_I]] +// int64x1_t test_vcvtp_s64_f64(float64x1_t a) { return vcvtp_s64_f64(a); } -// CHECK-LABEL: @test_vcvtp_u64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> %a) -// CHECK: ret <1 x i64> [[VCVTP1_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtp_u64_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x i64> [[VCVTP1_I]] +// uint64x1_t test_vcvtp_u64_f64(float64x1_t a) { return vcvtp_u64_f64(a); } -// CHECK-LABEL: @test_vcvtm_s64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> %a) -// CHECK: ret <1 x i64> [[VCVTM1_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtm_s64_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x i64> [[VCVTM1_I]] +// int64x1_t test_vcvtm_s64_f64(float64x1_t a) { return vcvtm_s64_f64(a); } -// CHECK-LABEL: @test_vcvtm_u64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> %a) -// CHECK: ret <1 x i64> [[VCVTM1_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcvtm_u64_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x i64> [[VCVTM1_I]] +// uint64x1_t test_vcvtm_u64_f64(float64x1_t a) { return vcvtm_u64_f64(a); } -// CHECK-LABEL: @test_vcvta_s64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> %a) -// CHECK: ret <1 x i64> [[VCVTA1_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcvta_s64_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x i64> [[VCVTA1_I]] +// int64x1_t test_vcvta_s64_f64(float64x1_t a) { return vcvta_s64_f64(a); } -// CHECK-LABEL: @test_vcvta_u64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> %a) -// CHECK: ret <1 x i64> [[VCVTA1_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcvta_u64_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x i64> [[VCVTA1_I]] +// uint64x1_t test_vcvta_u64_f64(float64x1_t a) { return vcvta_u64_f64(a); } -// CHECK-LABEL: @test_vcvt_f64_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[VCVT_I:%.*]] = sitofp <1 x i64> %a to <1 x double> -// CHECK: ret <1 x double> [[VCVT_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vcvt_f64_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <1 x i64> [[A]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[VCVT_I]] +// float64x1_t test_vcvt_f64_s64(int64x1_t a) { return vcvt_f64_s64(a); } -// CHECK-LABEL: @test_vcvt_f64_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[VCVT_I:%.*]] = uitofp <1 x i64> %a to <1 x double> -// CHECK: ret <1 x double> [[VCVT_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vcvt_f64_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <1 x i64> [[A]] to <1 x double> +// CHECK-NEXT: ret <1 x double> [[VCVT_I]] +// float64x1_t test_vcvt_f64_u64(uint64x1_t a) { return vcvt_f64_u64(a); } -// CHECK-LABEL: @test_vcvt_n_s64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> -// CHECK: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64) -// CHECK: ret <1 x i64> [[VCVT_N1]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcvt_n_s64_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> [[A]], i32 64) +// CHECK-NEXT: ret <1 x i64> [[VCVT_N1]] +// int64x1_t test_vcvt_n_s64_f64(float64x1_t a) { return vcvt_n_s64_f64(a, 64); } -// CHECK-LABEL: @test_vcvt_n_u64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> -// CHECK: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64) -// CHECK: ret <1 x i64> [[VCVT_N1]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcvt_n_u64_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> [[A]], i32 64) +// CHECK-NEXT: ret <1 x i64> [[VCVT_N1]] +// uint64x1_t test_vcvt_n_u64_f64(float64x1_t a) { return vcvt_n_u64_f64(a, 64); } -// CHECK-LABEL: @test_vcvt_n_f64_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64) -// CHECK: ret <1 x double> [[VCVT_N1]] +// CHECK-LABEL: define dso_local <1 x double> @test_vcvt_n_f64_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> [[A]], i32 64) +// CHECK-NEXT: ret <1 x double> [[VCVT_N1]] +// float64x1_t test_vcvt_n_f64_s64(int64x1_t a) { return vcvt_n_f64_s64(a, 64); } -// CHECK-LABEL: @test_vcvt_n_f64_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64) -// CHECK: ret <1 x double> [[VCVT_N1]] +// CHECK-LABEL: define dso_local <1 x double> @test_vcvt_n_f64_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> 
[[A]], i32 64) +// CHECK-NEXT: ret <1 x double> [[VCVT_N1]] +// float64x1_t test_vcvt_n_f64_u64(uint64x1_t a) { return vcvt_n_f64_u64(a, 64); } -// CHECK-LABEL: @test_vrndn_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VRNDN1_I:%.*]] = call <1 x double> @llvm.roundeven.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[VRNDN1_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrndn_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <1 x double> @llvm.roundeven.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x double> [[VRNDN1_I]] +// float64x1_t test_vrndn_f64(float64x1_t a) { return vrndn_f64(a); } -// CHECK-LABEL: @test_vrnda_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[VRNDA1_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrnda_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x double> [[VRNDA1_I]] +// float64x1_t test_vrnda_f64(float64x1_t a) { return vrnda_f64(a); } -// CHECK-LABEL: @test_vrndp_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[VRNDP1_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrndp_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x double> [[VRNDP1_I]] +// float64x1_t test_vrndp_f64(float64x1_t a) { return vrndp_f64(a); } -// CHECK-LABEL: @test_vrndm_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[VRNDM1_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrndm_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x double> [[VRNDM1_I]] +// float64x1_t test_vrndm_f64(float64x1_t a) { return vrndm_f64(a); } -// CHECK-LABEL: @test_vrndx_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[VRNDX1_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrndx_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x double> [[VRNDX1_I]] +// float64x1_t test_vrndx_f64(float64x1_t a) { return vrndx_f64(a); } -// CHECK-LABEL: @test_vrnd_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[VRNDZ1_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrnd_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x 
double> [[VRNDZ1_I]] +// float64x1_t test_vrnd_f64(float64x1_t a) { return vrnd_f64(a); } -// CHECK-LABEL: @test_vrndi_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VRNDI1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[VRNDI1_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrndi_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDI_V1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x double> [[VRNDI_V1_I]] +// float64x1_t test_vrndi_f64(float64x1_t a) { return vrndi_f64(a); } -// CHECK-LABEL: @test_vrsqrte_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VRSQRTE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrte.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[VRSQRTE_V1_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrsqrte_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrte.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x double> [[VRSQRTE_V1_I]] +// float64x1_t test_vrsqrte_f64(float64x1_t a) { return vrsqrte_f64(a); } -// CHECK-LABEL: @test_vrecpe_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VRECPE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecpe.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[VRECPE_V1_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrecpe_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecpe.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x double> [[VRECPE_V1_I]] +// float64x1_t test_vrecpe_f64(float64x1_t a) { return vrecpe_f64(a); } -// CHECK-LABEL: @test_vsqrt_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[VSQRT_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vsqrt_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x double> [[VSQRT_I]] +// float64x1_t test_vsqrt_f64(float64x1_t a) { return vsqrt_f64(a); } -// CHECK-LABEL: @test_vrecps_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VRECPS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecps.v1f64(<1 x double> %a, <1 x double> %b) -// CHECK: ret <1 x double> [[VRECPS_V2_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrecps_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecps.v1f64(<1 x double> [[A]], <1 x double> [[B]]) +// CHECK-NEXT: ret <1 x double> [[VRECPS_V2_I]] +// float64x1_t test_vrecps_f64(float64x1_t a, float64x1_t b) { return vrecps_f64(a, b); } -// CHECK-LABEL: @test_vrsqrts_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[VRSQRTS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrts.v1f64(<1 x 
double> %a, <1 x double> %b) -// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <1 x double> [[VRSQRTS_V2_I]] to <8 x i8> -// CHECK: ret <1 x double> [[VRSQRTS_V2_I]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrsqrts_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrts.v1f64(<1 x double> [[A]], <1 x double> [[B]]) +// CHECK-NEXT: ret <1 x double> [[VRSQRTS_V2_I]] +// float64x1_t test_vrsqrts_f64(float64x1_t a, float64x1_t b) { return vrsqrts_f64(a, b); } -// CHECK-LABEL: @test_vminv_s32( -// CHECK: [[VMINV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> %a) -// CHECK: ret i32 [[VMINV_S32_I]] +// CHECK-LABEL: define dso_local i32 @test_vminv_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret i32 [[VMINV_S32_I]] +// int32_t test_vminv_s32(int32x2_t a) { return vminv_s32(a); } -// CHECK-LABEL: @test_vminv_u32( -// CHECK: [[VMINV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> %a) -// CHECK: ret i32 [[VMINV_U32_I]] +// CHECK-LABEL: define dso_local i32 @test_vminv_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret i32 [[VMINV_U32_I]] +// uint32_t test_vminv_u32(uint32x2_t a) { return vminv_u32(a); } -// CHECK-LABEL: @test_vmaxv_s32( -// CHECK: [[VMAXV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> %a) -// CHECK: ret i32 [[VMAXV_S32_I]] +// CHECK-LABEL: define dso_local i32 @test_vmaxv_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret i32 [[VMAXV_S32_I]] +// int32_t test_vmaxv_s32(int32x2_t a) { return vmaxv_s32(a); } -// CHECK-LABEL: @test_vmaxv_u32( -// CHECK: [[VMAXV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(<2 x i32> %a) -// CHECK: ret i32 [[VMAXV_U32_I]] +// CHECK-LABEL: define dso_local i32 @test_vmaxv_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret i32 [[VMAXV_U32_I]] +// uint32_t test_vmaxv_u32(uint32x2_t a) { return vmaxv_u32(a); } -// CHECK-LABEL: @test_vaddv_s32( -// CHECK: [[VADDV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a) -// CHECK: ret i32 [[VADDV_S32_I]] +// CHECK-LABEL: define dso_local i32 @test_vaddv_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VADDV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret i32 [[VADDV_S32_I]] +// int32_t test_vaddv_s32(int32x2_t a) { return vaddv_s32(a); } -// CHECK-LABEL: @test_vaddv_u32( -// CHECK: [[VADDV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a) -// CHECK: ret i32 [[VADDV_U32_I]] +// CHECK-LABEL: define dso_local i32 @test_vaddv_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VADDV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> 
[[A]]) +// CHECK-NEXT: ret i32 [[VADDV_U32_I]] +// uint32_t test_vaddv_u32(uint32x2_t a) { return vaddv_u32(a); } -// CHECK-LABEL: @test_vaddlv_s32( -// CHECK: [[VADDLV_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %a) -// CHECK: ret i64 [[VADDLV_S32_I]] +// CHECK-LABEL: define dso_local i64 @test_vaddlv_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VADDLV_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret i64 [[VADDLV_S32_I]] +// int64_t test_vaddlv_s32(int32x2_t a) { return vaddlv_s32(a); } -// CHECK-LABEL: @test_vaddlv_u32( -// CHECK: [[VADDLV_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> %a) -// CHECK: ret i64 [[VADDLV_U32_I]] +// CHECK-LABEL: define dso_local i64 @test_vaddlv_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VADDLV_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret i64 [[VADDLV_U32_I]] +// uint64_t test_vaddlv_u32(uint32x2_t a) { return vaddlv_u32(a); } diff --git a/clang/test/CodeGen/AArch64/neon-ldst-one-rcpc3.c b/clang/test/CodeGen/AArch64/neon-ldst-one-rcpc3.c index 40c5a0a598d68..67619ae34b0b7 100644 --- a/clang/test/CodeGen/AArch64/neon-ldst-one-rcpc3.c +++ b/clang/test/CodeGen/AArch64/neon-ldst-one-rcpc3.c @@ -1,7 +1,7 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple aarch64 -target-feature +neon \ // RUN: -target-feature +rcpc3 -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg | FileCheck %s +// RUN: | opt -S -passes=mem2reg,instcombine | FileCheck %s // REQUIRES: aarch64-registered-target @@ -10,10 +10,8 @@ // CHECK-LABEL: @test_vldap1q_lane_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 -// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 +// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[B:%.*]], i64 [[TMP0]], i64 1 // CHECK-NEXT: ret <2 x i64> [[VLDAP1_LANE]] // uint64x2_t test_vldap1q_lane_u64(uint64_t *a, uint64x2_t b) { @@ -22,10 +20,8 @@ uint64x2_t test_vldap1q_lane_u64(uint64_t *a, uint64x2_t b) { // CHECK-LABEL: @test_vldap1q_lane_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 -// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 +// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[B:%.*]], i64 [[TMP0]], i64 1 // CHECK-NEXT: ret <2 x i64> [[VLDAP1_LANE]] // int64x2_t test_vldap1q_lane_s64(int64_t *a, int64x2_t b) { @@ -34,10 +30,8 @@ int64x2_t test_vldap1q_lane_s64(int64_t *a, int64x2_t b) { // CHECK-LABEL: @test_vldap1q_lane_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK-NEXT: [[TMP2:%.*]] = load atomic double, ptr 
[[A:%.*]] acquire, align 8 -// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x double> [[TMP1]], double [[TMP2]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = load atomic double, ptr [[A:%.*]] acquire, align 8 +// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x double> [[B:%.*]], double [[TMP0]], i64 1 // CHECK-NEXT: ret <2 x double> [[VLDAP1_LANE]] // float64x2_t test_vldap1q_lane_f64(float64_t *a, float64x2_t b) { @@ -46,10 +40,8 @@ float64x2_t test_vldap1q_lane_f64(float64_t *a, float64x2_t b) { // CHECK-LABEL: @test_vldap1q_lane_p64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 -// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[TMP2]], i32 1 +// CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 +// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <2 x i64> [[B:%.*]], i64 [[TMP0]], i64 1 // CHECK-NEXT: ret <2 x i64> [[VLDAP1_LANE]] // poly64x2_t test_vldap1q_lane_p64(poly64_t *a, poly64x2_t b) { @@ -58,10 +50,8 @@ poly64x2_t test_vldap1q_lane_p64(poly64_t *a, poly64x2_t b) { // CHECK-LABEL: @test_vldap1_lane_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 -// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 +// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0 // CHECK-NEXT: ret <1 x i64> [[VLDAP1_LANE]] // uint64x1_t test_vldap1_lane_u64(uint64_t *a, uint64x1_t b) { @@ -70,10 +60,8 @@ uint64x1_t test_vldap1_lane_u64(uint64_t *a, uint64x1_t b) { // CHECK-LABEL: @test_vldap1_lane_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 -// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 +// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0 // CHECK-NEXT: ret <1 x i64> [[VLDAP1_LANE]] // int64x1_t test_vldap1_lane_s64(int64_t *a, int64x1_t b) { @@ -82,10 +70,8 @@ int64x1_t test_vldap1_lane_s64(int64_t *a, int64x1_t b) { // CHECK-LABEL: @test_vldap1_lane_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> -// CHECK-NEXT: [[TMP2:%.*]] = load atomic double, ptr [[A:%.*]] acquire, align 8 -// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x double> [[TMP1]], double [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load atomic double, ptr [[A:%.*]] acquire, align 8 +// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x double> poison, double [[TMP0]], i64 0 // CHECK-NEXT: ret <1 x double> [[VLDAP1_LANE]] // float64x1_t test_vldap1_lane_f64(float64_t *a, float64x1_t b) { @@ -94,10 +80,8 @@ float64x1_t test_vldap1_lane_f64(float64_t *a, float64x1_t b) { // CHECK-LABEL: @test_vldap1_lane_p64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] 
= bitcast <1 x i64> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK-NEXT: [[TMP2:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 -// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 [[TMP2]], i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8 +// CHECK-NEXT: [[VLDAP1_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0 // CHECK-NEXT: ret <1 x i64> [[VLDAP1_LANE]] // poly64x1_t test_vldap1_lane_p64(poly64_t *a, poly64x1_t b) { @@ -106,10 +90,8 @@ poly64x1_t test_vldap1_lane_p64(poly64_t *a, poly64x1_t b) { // CHECK-LABEL: @test_vstl1q_lane_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 -// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8 +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[B:%.*]], i64 1 +// CHECK-NEXT: store atomic i64 [[TMP0]], ptr [[A:%.*]] release, align 8 // CHECK-NEXT: ret void // void test_vstl1q_lane_u64(uint64_t *a, uint64x2_t b) { @@ -118,10 +100,8 @@ void test_vstl1q_lane_u64(uint64_t *a, uint64x2_t b) { // CHECK-LABEL: @test_vstl1q_lane_s64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 -// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8 +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[B:%.*]], i64 1 +// CHECK-NEXT: store atomic i64 [[TMP0]], ptr [[A:%.*]] release, align 8 // CHECK-NEXT: ret void // void test_vstl1q_lane_s64(int64_t *a, int64x2_t b) { @@ -130,10 +110,8 @@ void test_vstl1q_lane_s64(int64_t *a, int64x2_t b) { // CHECK-LABEL: @test_vstl1q_lane_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> -// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -// CHECK-NEXT: store atomic double [[TMP2]], ptr [[A:%.*]] release, align 8 +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[B:%.*]], i64 1 +// CHECK-NEXT: store atomic double [[TMP0]], ptr [[A:%.*]] release, align 8 // CHECK-NEXT: ret void // void test_vstl1q_lane_f64(float64_t *a, float64x2_t b) { @@ -142,10 +120,8 @@ void test_vstl1q_lane_f64(float64_t *a, float64x2_t b) { // CHECK-LABEL: @test_vstl1q_lane_p64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 -// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8 +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[B:%.*]], i64 1 +// CHECK-NEXT: store atomic i64 [[TMP0]], ptr [[A:%.*]] release, align 8 // CHECK-NEXT: ret void // void test_vstl1q_lane_p64(poly64_t *a, poly64x2_t b) { @@ -154,10 +130,8 @@ void test_vstl1q_lane_p64(poly64_t *a, poly64x2_t b) { // CHECK-LABEL: @test_vstl1_lane_u64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0 
-// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[B:%.*]], i64 0
+// CHECK-NEXT: store atomic i64 [[TMP0]], ptr [[A:%.*]] release, align 8
// CHECK-NEXT: ret void
//
void test_vstl1_lane_u64(uint64_t *a, uint64x1_t b) {
@@ -166,10 +140,8 @@ void test_vstl1_lane_u64(uint64_t *a, uint64x1_t b) {
// CHECK-LABEL: @test_vstl1_lane_s64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
-// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[B:%.*]], i64 0
+// CHECK-NEXT: store atomic i64 [[TMP0]], ptr [[A:%.*]] release, align 8
// CHECK-NEXT: ret void
//
void test_vstl1_lane_s64(int64_t *a, int64x1_t b) {
@@ -178,10 +150,8 @@ void test_vstl1_lane_s64(int64_t *a, int64x1_t b) {
// CHECK-LABEL: @test_vstl1_lane_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
-// CHECK-NEXT: store atomic double [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[B:%.*]], i64 0
+// CHECK-NEXT: store atomic double [[TMP0]], ptr [[A:%.*]] release, align 8
// CHECK-NEXT: ret void
//
void test_vstl1_lane_f64(float64_t *a, float64x1_t b) {
@@ -190,10 +160,8 @@ void test_vstl1_lane_f64(float64_t *a, float64x1_t b) {
// CHECK-LABEL: @test_vstl1_lane_p64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK-NEXT: [[TMP2:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
-// CHECK-NEXT: store atomic i64 [[TMP2]], ptr [[A:%.*]] release, align 8
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[B:%.*]], i64 0
+// CHECK-NEXT: store atomic i64 [[TMP0]], ptr [[A:%.*]] release, align 8
// CHECK-NEXT: ret void
//
void test_vstl1_lane_p64(poly64_t *a, poly64x1_t b) {
diff --git a/clang/test/CodeGen/AArch64/neon-ldst-one.c b/clang/test/CodeGen/AArch64/neon-ldst-one.c
index b57df40d8e5c9..b273428ccba96 100644
--- a/clang/test/CodeGen/AArch64/neon-ldst-one.c
+++ b/clang/test/CodeGen/AArch64/neon-ldst-one.c
@@ -1,5757 +1,7550 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck %s
+// RUN: | opt -S -passes=mem2reg,instcombine | FileCheck %s
// REQUIRES: aarch64-registered-target || arm-registered-target
#include <arm_neon.h>
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_u8(ptr noundef %a) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0
-// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
-// CHECK: ret <16 x i8> [[LANE]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_dup_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison,
i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// uint8x16_t test_vld1q_dup_u8(uint8_t *a) { return vld1q_dup_u8(a); } -// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_u16(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i16, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i16> [[LANE]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_dup_u16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> [[LANE]] +// uint16x8_t test_vld1q_dup_u16(uint16_t *a) { return vld1q_dup_u16(a); } -// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_dup_u32(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i32, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i32> [[LANE]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_dup_u32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i32> [[LANE]] +// uint32x4_t test_vld1q_dup_u32(uint32_t *a) { return vld1q_dup_u32(a); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_u64(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i64, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[LANE]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_dup_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> [[LANE]] +// uint64x2_t test_vld1q_dup_u64(uint64_t *a) { return vld1q_dup_u64(a); } -// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_s8(ptr noundef %a) #0 { -// CHECK: [[TMP0:%.*]] = load i8, ptr %a -// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer -// CHECK: ret <16 x i8> [[LANE]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_dup_s8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer +// 
CHECK-NEXT: ret <16 x i8> [[LANE]] +// int8x16_t test_vld1q_dup_s8(int8_t *a) { return vld1q_dup_s8(a); } -// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_s16(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i16, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i16> [[LANE]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_dup_s16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> [[LANE]] +// int16x8_t test_vld1q_dup_s16(int16_t *a) { return vld1q_dup_s16(a); } -// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_dup_s32(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i32, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i32> [[LANE]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_dup_s32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i32> [[LANE]] +// int32x4_t test_vld1q_dup_s32(int32_t *a) { return vld1q_dup_s32(a); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_s64(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i64, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[LANE]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_dup_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> [[LANE]] +// int64x2_t test_vld1q_dup_s64(int64_t *a) { return vld1q_dup_s64(a); } -// CHECK-LABEL: define{{.*}} <8 x half> @test_vld1q_dup_f16(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load half, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <8 x half> poison, half [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer -// CHECK: ret <8 x half> [[LANE]] +// CHECK-LABEL: define dso_local <8 x half> @test_vld1q_dup_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x half> [[LANE]] +// float16x8_t test_vld1q_dup_f16(float16_t *a) { return vld1q_dup_f16(a); } -// 
CHECK-LABEL: define{{.*}} <4 x float> @test_vld1q_dup_f32(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load float, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x float> [[LANE]] +// CHECK-LABEL: define dso_local <4 x float> @test_vld1q_dup_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x float> [[LANE]] +// float32x4_t test_vld1q_dup_f32(float32_t *a) { return vld1q_dup_f32(a); } -// CHECK-LABEL: define{{.*}} <2 x double> @test_vld1q_dup_f64(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load double, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x double> [[LANE]] +// CHECK-LABEL: define dso_local <2 x double> @test_vld1q_dup_f64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[A]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x double> [[LANE]] +// float64x2_t test_vld1q_dup_f64(float64_t *a) { return vld1q_dup_f64(a); } -// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_p8(ptr noundef %a) #0 { -// CHECK: [[TMP0:%.*]] = load i8, ptr %a -// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer -// CHECK: ret <16 x i8> [[LANE]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_dup_p8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// poly8x16_t test_vld1q_dup_p8(poly8_t *a) { return vld1q_dup_p8(a); } -// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_p16(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i16, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i16> [[LANE]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_dup_p16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> [[LANE]] +// poly16x8_t test_vld1q_dup_p16(poly16_t *a) { return vld1q_dup_p16(a); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_p64(ptr 
noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i64, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[LANE]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_dup_p64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> [[LANE]] +// poly64x2_t test_vld1q_dup_p64(poly64_t *a) { return vld1q_dup_p64(a); } -// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_u8(ptr noundef %a) #0 { -// CHECK: [[TMP0:%.*]] = load i8, ptr %a -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i8> [[LANE]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_dup_u8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// uint8x8_t test_vld1_dup_u8(uint8_t *a) { return vld1_dup_u8(a); } -// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_u16(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i16, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i16> [[LANE]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_dup_u16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> [[LANE]] +// uint16x4_t test_vld1_dup_u16(uint16_t *a) { return vld1_dup_u16(a); } -// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_dup_u32(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i32, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i32> [[LANE]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_dup_u32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i32> [[LANE]] +// uint32x2_t test_vld1_dup_u32(uint32_t *a) { return vld1_dup_u32(a); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_u64(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i64, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = 
shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[LANE]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_dup_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0 +// CHECK-NEXT: ret <1 x i64> [[TMP1]] +// uint64x1_t test_vld1_dup_u64(uint64_t *a) { return vld1_dup_u64(a); } -// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_s8(ptr noundef %a) #0 { -// CHECK: [[TMP0:%.*]] = load i8, ptr %a -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i8> [[LANE]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_dup_s8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// int8x8_t test_vld1_dup_s8(int8_t *a) { return vld1_dup_s8(a); } -// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_s16(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i16, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i16> [[LANE]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_dup_s16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> [[LANE]] +// int16x4_t test_vld1_dup_s16(int16_t *a) { return vld1_dup_s16(a); } -// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_dup_s32(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i32, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i32> [[LANE]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_dup_s32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i32> [[LANE]] +// int32x2_t test_vld1_dup_s32(int32_t *a) { return vld1_dup_s32(a); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_s64(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i64, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[LANE]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_dup_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[TMP0:%.*]] = load i64, ptr [[A]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0 +// CHECK-NEXT: ret <1 x i64> [[TMP1]] +// int64x1_t test_vld1_dup_s64(int64_t *a) { return vld1_dup_s64(a); } -// CHECK-LABEL: define{{.*}} <4 x half> @test_vld1_dup_f16(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load half, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <4 x half> poison, half [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x half> [[LANE]] +// CHECK-LABEL: define dso_local <4 x half> @test_vld1_dup_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x half> [[LANE]] +// float16x4_t test_vld1_dup_f16(float16_t *a) { return vld1_dup_f16(a); } -// CHECK-LABEL: define{{.*}} <2 x float> @test_vld1_dup_f32(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load float, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x float> [[LANE]] +// CHECK-LABEL: define dso_local <2 x float> @test_vld1_dup_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x float> [[LANE]] +// float32x2_t test_vld1_dup_f32(float32_t *a) { return vld1_dup_f32(a); } -// CHECK-LABEL: define{{.*}} <1 x double> @test_vld1_dup_f64(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load double, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer -// CHECK: ret <1 x double> [[LANE]] +// CHECK-LABEL: define dso_local <1 x double> @test_vld1_dup_f64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[A]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x double> poison, double [[TMP0]], i64 0 +// CHECK-NEXT: ret <1 x double> [[TMP1]] +// float64x1_t test_vld1_dup_f64(float64_t *a) { return vld1_dup_f64(a); } -// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_p8(ptr noundef %a) #0 { -// CHECK: [[TMP0:%.*]] = load i8, ptr %a -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i8> [[LANE]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_dup_p8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// poly8x8_t 
test_vld1_dup_p8(poly8_t *a) { return vld1_dup_p8(a); } -// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_p16(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i16, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i16> [[LANE]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_dup_p16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> [[LANE]] +// poly16x4_t test_vld1_dup_p16(poly16_t *a) { return vld1_dup_p16(a); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_p64(ptr noundef %a) #0 { -// CHECK: [[TMP2:%.*]] = load i64, ptr %a -// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[LANE]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_dup_p64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0 +// CHECK-NEXT: ret <1 x i64> [[TMP1]] +// poly64x1_t test_vld1_dup_p64(poly64_t *a) { return vld1_dup_p64(a); } -// CHECK-LABEL: define{{.*}} %struct.uint64x2x2_t @test_vld2q_dup_u64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16 -// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint64x2x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint64x2x2_t @test_vld2q_dup_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16 +// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD2_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1 +// CHECK-NEXT: store <2 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr 
[[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT3]], align 16 +// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK4]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK5]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT64X2X2_T]] [[TMP1]] +// uint64x2x2_t test_vld2q_dup_u64(uint64_t *a) { return vld2q_dup_u64(a); } -// CHECK-LABEL: define{{.*}} %struct.int64x2x2_t @test_vld2q_dup_s64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16 -// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int64x2x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int64x2x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int64x2x2_t @test_vld2q_dup_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16 +// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD2_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1 +// CHECK-NEXT: store <2 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT3]], align 16 +// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK4]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK5]], 0 +// CHECK-NEXT: ret [[STRUCT_INT64X2X2_T]] [[TMP1]] +// int64x2x2_t test_vld2q_dup_s64(int64_t *a) { return vld2q_dup_s64(a); } -// CHECK-LABEL: define{{.*}} %struct.float64x2x2_t @test_vld2q_dup_f64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0(ptr %a) -// CHECK: store { <2 x double>, <2 x double> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float64x2x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x2x2_t @test_vld2q_dup_f64( +// CHECK-SAME: ptr noundef 
[[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16 +// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2]], 0 +// CHECK-NEXT: store <2 x double> [[VLD2_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2]], 1 +// CHECK-NEXT: store <2 x double> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT3]], align 16 +// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK4]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT64X2X2_T]] poison, [2 x <2 x double>] [[DOTUNPACK5]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X2_T]] [[TMP1]] +// float64x2x2_t test_vld2q_dup_f64(float64_t *a) { return vld2q_dup_f64(a); } -// CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_dup_p64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16 -// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly64x2x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x2x2_t @test_vld2q_dup_p64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16 +// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD2_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1 +// CHECK-NEXT: store <2 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = 
getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT3]], align 16 +// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK4]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK5]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY64X2X2_T]] [[TMP1]] +// poly64x2x2_t test_vld2q_dup_p64(poly64_t *a) { return vld2q_dup_p64(a); } -// CHECK-LABEL: define{{.*}} %struct.float64x1x2_t @test_vld2_dup_f64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0(ptr %a) -// CHECK: store { <1 x double>, <1 x double> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float64x1x2_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x1x2_t @test_vld2_dup_f64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8 +// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2]], 0 +// CHECK-NEXT: store <1 x double> [[VLD2_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2]], 1 +// CHECK-NEXT: store <1 x double> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT3]], align 8 +// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK4]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT64X1X2_T]] poison, [2 x <1 x double>] [[DOTUNPACK5]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X2_T]] [[TMP1]] +// float64x1x2_t test_vld2_dup_f64(float64_t *a) { return vld2_dup_f64(a); } -// CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_dup_p64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8 -// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0(ptr %a) -// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly64x1x2_t [[TMP6]] +// CHECK-LABEL: define dso_local 
%struct.poly64x1x2_t @test_vld2_dup_p64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8 +// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 0 +// CHECK-NEXT: store <1 x i64> [[VLD2_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 1 +// CHECK-NEXT: store <1 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT3]], align 8 +// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK4]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY64X1X2_T]] poison, [2 x <1 x i64>] [[DOTUNPACK5]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY64X1X2_T]] [[TMP1]] +// poly64x1x2_t test_vld2_dup_p64(poly64_t *a) { return vld2_dup_p64(a); } -// CHECK-LABEL: define{{.*}} %struct.uint64x2x3_t @test_vld3q_dup_u64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint64x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.uint64x2x3_t @test_vld3q_dup_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16 +// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD3_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1 +// CHECK-NEXT: store <2 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2 +// CHECK-NEXT: store <2 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 
dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT64X2X3_T]] [[TMP2]] +// uint64x2x3_t test_vld3q_dup_u64(uint64_t *a) { return vld3q_dup_u64(a); // [{{x[0-9]+|sp}}] } -// CHECK-LABEL: define{{.*}} %struct.int64x2x3_t @test_vld3q_dup_s64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.int64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int64x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.int64x2x3_t @test_vld3q_dup_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16 +// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD3_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1 +// CHECK-NEXT: store <2 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2 +// CHECK-NEXT: store <2 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i64>] [[TMP0]], <2 x i64> 
[[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_INT64X2X3_T]] [[TMP2]] +// int64x2x3_t test_vld3q_dup_s64(int64_t *a) { return vld3q_dup_s64(a); // [{{x[0-9]+|sp}}] } -// CHECK-LABEL: define{{.*}} %struct.float64x2x3_t @test_vld3q_dup_f64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0(ptr %a) -// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float64x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.float64x2x3_t @test_vld3q_dup_f64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16 +// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 0 +// CHECK-NEXT: store <2 x double> [[VLD3_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 1 +// CHECK-NEXT: store <2 x double> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], 2 +// CHECK-NEXT: store <2 x double> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x double>] [[TMP1]], <2 x double> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T]] poison, [3 x <2 x double>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret 
[[STRUCT_FLOAT64X2X3_T]] [[TMP2]] +// float64x2x3_t test_vld3q_dup_f64(float64_t *a) { return vld3q_dup_f64(a); // [{{x[0-9]+|sp}}] } -// CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_dup_p64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16 -// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly64x2x3_t [[TMP6]] +// CHECK-LABEL: define dso_local %struct.poly64x2x3_t @test_vld3q_dup_p64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16 +// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0(ptr [[A]]) +// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD3_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1 +// CHECK-NEXT: store <2 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2 +// CHECK-NEXT: store <2 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16 +// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK8]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK9]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY64X2X3_T]] [[TMP2]] +// poly64x2x3_t test_vld3q_dup_p64(poly64_t *a) { return vld3q_dup_p64(a); // [{{x[0-9]+|sp}}] } -// CHECK-LABEL: define{{.*}} %struct.float64x1x3_t @test_vld3_dup_f64(ptr noundef %a) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0(ptr %a) -// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], ptr [[__RET]] 
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x1x3_t @test_vld3_dup_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 0
+// CHECK-NEXT: store <1 x double> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 1
+// CHECK-NEXT: store <1 x double> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], 2
+// CHECK-NEXT: store <1 x double> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <1 x double>] [[TMP1]], <1 x double> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T]] poison, [3 x <1 x double>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X3_T]] [[TMP2]]
+//
float64x1x3_t test_vld3_dup_f64(float64_t *a) {
  return vld3_dup_f64(a);
  // [{{x[0-9]+|sp}}]
}

-// CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_dup_p64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x3_t @test_vld3_dup_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T]] poison, [3 x <1 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X3_T]] [[TMP2]]
+//
poly64x1x3_t test_vld3_dup_p64(poly64_t *a) {
  return vld3_dup_p64(a);
  // [{{x[0-9]+|sp}}]
}

-// CHECK-LABEL: define{{.*}} %struct.uint64x2x4_t @test_vld4q_dup_u64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.uint64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.uint64x2x4_t @test_vld4q_dup_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X2X4_T]] [[TMP3]]
+//
uint64x2x4_t test_vld4q_dup_u64(uint64_t *a) {
  return vld4q_dup_u64(a);
}

-// CHECK-LABEL: define{{.*}} %struct.int64x2x4_t @test_vld4q_dup_s64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.int64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.int64x2x4_t @test_vld4q_dup_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X2X4_T]] [[TMP3]]
+//
int64x2x4_t test_vld4q_dup_s64(int64_t *a) {
  return vld4q_dup_s64(a);
}

-// CHECK-LABEL: define{{.*}} %struct.float64x2x4_t @test_vld4q_dup_f64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0(ptr %a)
-// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x2x4_t @test_vld4q_dup_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x double> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x double> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x double> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x double> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x double>] [[TMP0]], <2 x double> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x double>] [[TMP1]], <2 x double> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x double>] [[TMP2]], <2 x double> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] poison, [4 x <2 x double>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X4_T]] [[TMP3]]
+//
float64x2x4_t test_vld4q_dup_f64(float64_t *a) {
  return vld4q_dup_f64(a);
}

-// CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t @test_vld4q_dup_p64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x4_t @test_vld4q_dup_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X4_T]] [[TMP3]]
+//
poly64x2x4_t test_vld4q_dup_p64(poly64_t *a) {
  return vld4q_dup_p64(a);
}

-// CHECK-LABEL: define{{.*}} %struct.float64x1x4_t @test_vld4_dup_f64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0(ptr %a)
-// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.float64x1x4_t @test_vld4_dup_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 0
+// CHECK-NEXT: store <1 x double> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 1
+// CHECK-NEXT: store <1 x double> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 2
+// CHECK-NEXT: store <1 x double> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], 3
+// CHECK-NEXT: store <1 x double> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <1 x double>] [[TMP0]], <1 x double> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <1 x double>] [[TMP1]], <1 x double> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <1 x double>] [[TMP2]], <1 x double> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] poison, [4 x <1 x double>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X4_T]] [[TMP3]]
+//
float64x1x4_t test_vld4_dup_f64(float64_t *a) {
  return vld4_dup_f64(a);
}

-// CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_dup_p64(ptr noundef %a) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0(ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x4_t @test_vld4_dup_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0(ptr [[A]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <1 x i64>] [[TMP2]], <1 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] poison, [4 x <1 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X4_T]] [[TMP3]]
+//
poly64x1x4_t test_vld4_dup_p64(poly64_t *a) {
  return vld4_dup_p64(a);
}

-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_u8(ptr noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK: ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i64 15
+// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]]
+//
uint8x16_t test_vld1q_lane_u8(uint8_t *a, uint8x16_t b) {
  return vld1q_lane_u8(a, b, 15);
}

-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_u16(ptr noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK: ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
+//
uint16x8_t test_vld1q_lane_u16(uint16_t *a, uint16x8_t b) {
  return vld1q_lane_u16(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_lane_u32(ptr noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
-// CHECK: ret <4 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x i32> [[VLD1_LANE]]
+//
uint32x4_t test_vld1q_lane_u32(uint32_t *a, uint32x4_t b) {
  return vld1q_lane_u32(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_u64(ptr noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
-// CHECK: ret <2 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[B]], i64 [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x i64> [[VLD1_LANE]]
+//
uint64x2_t test_vld1q_lane_u64(uint64_t *a, uint64x2_t b) {
  return vld1q_lane_u64(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_s8(ptr noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK: ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i64 15
+// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]]
+//
int8x16_t test_vld1q_lane_s8(int8_t *a, int8x16_t b) {
  return vld1q_lane_s8(a, b, 15);
}

-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_s16(ptr noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK: ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
+//
int16x8_t test_vld1q_lane_s16(int16_t *a, int16x8_t b) {
  return vld1q_lane_s16(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_lane_s32(ptr noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
-// CHECK: ret <4 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vld1q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x i32> [[VLD1_LANE]]
+//
int32x4_t test_vld1q_lane_s32(int32_t *a, int32x4_t b) {
  return vld1q_lane_s32(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_s64(ptr noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
-// CHECK: ret <2 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[B]], i64 [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x i64> [[VLD1_LANE]]
+//
int64x2_t test_vld1q_lane_s64(int64_t *a, int64x2_t b) {
  return vld1q_lane_s64(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} <8 x half> @test_vld1q_lane_f16(ptr noundef %a, <8 x half> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: [[TMP4:%.*]] = load half, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
-// CHECK: ret <8 x half> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vld1q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[B]], half [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x half> [[VLD1_LANE]]
+//
float16x8_t test_vld1q_lane_f16(float16_t *a, float16x8_t b) {
  return vld1q_lane_f16(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} <4 x float> @test_vld1q_lane_f32(ptr noundef %a, <4 x float> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[TMP4:%.*]] = load float, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
-// CHECK: ret <4 x float> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vld1q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[B]], float [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x float> [[VLD1_LANE]]
+//
float32x4_t test_vld1q_lane_f32(float32_t *a, float32x4_t b) {
  return vld1q_lane_f32(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} <2 x double> @test_vld1q_lane_f64(ptr noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK: [[TMP4:%.*]] = load double, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP4]], i32 1
-// CHECK: ret <2 x double> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vld1q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x double> [[B]], double [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x double> [[VLD1_LANE]]
+//
float64x2_t test_vld1q_lane_f64(float64_t *a, float64x2_t b) {
  return vld1q_lane_f64(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_p8(ptr noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK: ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vld1q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i64 15
+// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]]
+//
poly8x16_t test_vld1q_lane_p8(poly8_t *a, poly8x16_t b) {
  return vld1q_lane_p8(a, b, 15);
}

-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_p16(ptr noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK: ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vld1q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
+//
poly16x8_t test_vld1q_lane_p16(poly16_t *a, poly16x8_t b) {
  return vld1q_lane_p16(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_p64(ptr noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
-// CHECK: ret <2 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[B]], i64 [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x i64> [[VLD1_LANE]]
+//
poly64x2_t test_vld1q_lane_p64(poly64_t *a, poly64x2_t b) {
  return vld1q_lane_p64(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_u8(ptr noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK: ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
+//
uint8x8_t test_vld1_lane_u8(uint8_t *a, uint8x8_t b) {
  return vld1_lane_u8(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_u16(ptr noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK: ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
+//
uint16x4_t test_vld1_lane_u16(uint16_t *a, uint16x4_t b) {
  return vld1_lane_u16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_lane_u32(ptr noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
-// CHECK: ret <2 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x i32> [[VLD1_LANE]]
+//
uint32x2_t test_vld1_lane_u32(uint32_t *a, uint32x2_t b) {
  return vld1_lane_u32(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_u64(ptr noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
-// CHECK: ret <1 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]]
+//
uint64x1_t test_vld1_lane_u64(uint64_t *a, uint64x1_t b) {
  return vld1_lane_u64(a, b, 0);
}

-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_s8(ptr noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK: ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
+//
int8x8_t test_vld1_lane_s8(int8_t *a, int8x8_t b) {
  return vld1_lane_s8(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_s16(ptr noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK: ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
+//
int16x4_t test_vld1_lane_s16(int16_t *a, int16x4_t b) {
  return vld1_lane_s16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_lane_s32(ptr noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
-// CHECK: ret <2 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vld1_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x i32> [[VLD1_LANE]]
+//
int32x2_t test_vld1_lane_s32(int32_t *a, int32x2_t b) {
  return vld1_lane_s32(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_s64(ptr noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
-// CHECK: ret <1 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]]
+//
int64x1_t test_vld1_lane_s64(int64_t *a, int64x1_t b) {
  return vld1_lane_s64(a, b, 0);
}

-// CHECK-LABEL: define{{.*}} <4 x half> @test_vld1_lane_f16(ptr noundef %a, <4 x half> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: [[TMP4:%.*]] = load half, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3
-// CHECK: ret <4 x half> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vld1_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[B]], half [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x half> [[VLD1_LANE]]
+//
float16x4_t test_vld1_lane_f16(float16_t *a, float16x4_t b) {
  return vld1_lane_f16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} <2 x float> @test_vld1_lane_f32(ptr noundef %a, <2 x float> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[TMP4:%.*]] = load float, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
-// CHECK: ret <2 x float> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vld1_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[B]], float [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x float> [[VLD1_LANE]]
+//
float32x2_t test_vld1_lane_f32(float32_t *a, float32x2_t b) {
  return vld1_lane_f32(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} <1 x double> @test_vld1_lane_f64(ptr noundef %a, <1 x double> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[TMP4:%.*]] = load double, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x double> [[TMP2]], double [[TMP4]], i32 0
-// CHECK: ret <1 x double> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vld1_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x double> poison, double [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x double> [[VLD1_LANE]]
+//
float64x1_t test_vld1_lane_f64(float64_t *a, float64x1_t b) {
  return vld1_lane_f64(a, b, 0);
}

-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_p8(ptr noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK: ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vld1_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
+//
poly8x8_t test_vld1_lane_p8(poly8_t *a, poly8x8_t b) {
  return vld1_lane_p8(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_p16(ptr noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK: ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vld1_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
+//
poly16x4_t test_vld1_lane_p16(poly16_t *a, poly16x4_t b) {
  return vld1_lane_p16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_p64(ptr noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
-// CHECK: ret <1 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 8
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]]
+//
poly64x1_t test_vld1_lane_p64(poly64_t *a, poly64x1_t b) {
  return vld1_lane_p64(a, b, 0);
}

-// CHECK-LABEL: define{{.*}} %struct.int8x16x2_t @test_vld2q_lane_s8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[SRC:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[SRC]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vld2q_lane_s8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) [[SRC_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT: [[SRC_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT]], ptr [[SRC]], align 16
+// CHECK-NEXT: [[SRC_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 16
+// CHECK-NEXT: [[SRC_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT2]], ptr [[SRC_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[SRC]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i64 15, ptr [[PTR]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <16 x i8>] [[TMP2]], <16 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT8X16X2_T]] poison, [2 x <16 x i8>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X16X2_T]] [[TMP3]]
+//
int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
  return vld2q_lane_s8(ptr, src, 15);
}

-// CHECK-LABEL: define{{.*}} %struct.uint8x16x2_t @test_vld2q_lane_u8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[SRC:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[SRC]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vld2q_lane_u8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) [[SRC_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT: [[SRC_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT]], ptr [[SRC]], align 16
+// CHECK-NEXT: [[SRC_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 16
+// CHECK-NEXT: [[SRC_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT2]], ptr [[SRC_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[SRC]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i64 15, ptr [[PTR]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <16 x i8>] [[TMP2]], <16 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T]] poison, [2 x <16 x i8>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X2_T]] [[TMP3]]
+//
uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
  return vld2q_lane_u8(ptr, src, 15);
}

-// CHECK-LABEL: define{{.*}} %struct.poly8x16x2_t @test_vld2q_lane_p8(ptr noundef %ptr, [2 x <16 x i8>] alignstack(16) %src.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[SRC:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[SRC]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %ptr)
-// CHECK: store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vld2q_lane_p8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <16 x i8>] alignstack(16) [[SRC_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT: [[SRC_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT]], ptr [[SRC]], align 16
+// CHECK-NEXT: [[SRC_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 16
+// CHECK-NEXT: [[SRC_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[SRC_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT2]], ptr [[SRC_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[SRC]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i64 15, ptr [[PTR]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <16 x i8>] [[TMP2]], <16 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T]] poison, [2 x <16 x i8>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X2_T]] [[TMP3]]
+//
poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
  return vld2q_lane_p8(ptr, src, 15);
}

-// CHECK-LABEL: define{{.*}} %struct.int8x16x3_t @test_vld3q_lane_s8(ptr noundef %ptr, [3 x <16 x i8>] alignstack(16) %src.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[SRC:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[SRC]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %ptr)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP9:%.*]] = load %struct.int8x16x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x3_t [[TMP9]]
+// CHECK-LABEL: define dso_local %struct.int8x16x3_t @test_vld3q_lane_s8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) [[SRC_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT: [[SRC_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT]], ptr [[SRC]], align 16
+// CHECK-NEXT: [[SRC_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 16
+// CHECK-NEXT: [[SRC_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT2]], ptr [[SRC_REPACK1]], align 16
+// CHECK-NEXT: [[SRC_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 32
+// CHECK-NEXT: [[SRC_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT4]], ptr [[SRC_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[SRC]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i64 15, ptr [[PTR]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <16 x i8>] [[TMP3]], <16 x i8> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <16 x i8>] [[TMP4]], <16 x i8> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_INT8X16X3_T]] poison, [3 x <16 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X16X3_T]] [[TMP5]]
+//
int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
  return vld3q_lane_s8(ptr, src, 15);
}

-// CHECK-LABEL: define{{.*}} %struct.uint8x16x3_t @test_vld3q_lane_u8(ptr noundef %ptr, [3 x <16 x i8>] alignstack(16) %src.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[SRC:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[SRC]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[SRC]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[SRC]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %ptr)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP9:%.*]] = load %struct.uint8x16x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x3_t [[TMP9]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x3_t @test_vld3q_lane_u8(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <16 x i8>] alignstack(16) [[SRC_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[SRC:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT: [[SRC_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT]], ptr [[SRC]], align 16
+// CHECK-NEXT: [[SRC_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 16
+// CHECK-NEXT: [[SRC_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT2]], ptr [[SRC_REPACK1]], align 16
+// CHECK-NEXT: [[SRC_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 32
+// CHECK-NEXT: [[SRC_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[SRC_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[SRC_COERCE_ELT4]], ptr [[SRC_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[SRC]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8>
[[TMP1]], <16 x i8> [[TMP2]], i64 15, ptr [[PTR]]) +// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 0 +// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 1 +// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 2 +// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT9]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <16 x i8>] [[TMP3]], <16 x i8> [[DOTUNPACK_UNPACK10]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT11]], align 16 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <16 x i8>] [[TMP4]], <16 x i8> [[DOTUNPACK_UNPACK12]], 2 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_UINT8X16X3_T]] poison, [3 x <16 x i8>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT8X16X3_T]] [[TMP5]] +// uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) { return vld3q_lane_u8(ptr, src, 15); } -// CHECK-LABEL: define{{.*}} %struct.uint16x8x2_t @test_vld2q_lane_u16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: 
[[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a) -// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint16x8x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vld2q_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 0 +// CHECK-NEXT: store <8 x i16> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 1 +// CHECK-NEXT: store <8 x i16> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <8 x i16>] [[TMP2]], <8 x i16> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] poison, [2 x <8 x i16>] [[DOTUNPACK7]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT16X8X2_T]] [[TMP3]] +// uint16x8x2_t test_vld2q_lane_u16(uint16_t *a, uint16x8x2_t b) { return vld2q_lane_u16(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.uint32x4x2_t @test_vld2q_lane_u32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 -// CHECK: [[B:%.*]] = alloca 
%struct.uint32x4x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> -// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, ptr %a) -// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint32x4x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vld2q_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2_LANE]], 0 +// CHECK-NEXT: store <4 x i32> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2_LANE]], 1 +// 
CHECK-NEXT: store <4 x i32> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <4 x i32>] [[TMP2]], <4 x i32> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T]] poison, [2 x <4 x i32>] [[DOTUNPACK7]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT32X4X2_T]] [[TMP3]] +// uint32x4x2_t test_vld2q_lane_u32(uint32_t *a, uint32x4x2_t b) { return vld2q_lane_u32(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.uint64x2x2_t @test_vld2q_lane_u64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> -// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a) -// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.uint64x2x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint64x2x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.uint64x2x2_t @test_vld2q_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: 
store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 1 +// CHECK-NEXT: store <2 x i64> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK7]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT64X2X2_T]] [[TMP3]] +// uint64x2x2_t test_vld2q_lane_u64(uint64_t *a, uint64x2x2_t b) { return vld2q_lane_u64(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.int16x8x2_t @test_vld2q_lane_s16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = 
bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a) -// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int16x8x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vld2q_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 0 +// CHECK-NEXT: store <8 x i16> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 1 +// CHECK-NEXT: store <8 x i16> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <8 x i16>] [[TMP2]], <8 x i16> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT16X8X2_T]] poison, [2 x <8 x i16>] [[DOTUNPACK7]], 0 +// CHECK-NEXT: ret [[STRUCT_INT16X8X2_T]] [[TMP3]] +// int16x8x2_t test_vld2q_lane_s16(int16_t *a, int16x8x2_t b) { return vld2q_lane_s16(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.int32x4x2_t @test_vld2q_lane_s32(ptr 
noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> -// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, ptr %a) -// CHECK: store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int32x4x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vld2q_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2_LANE]], 0 +// CHECK-NEXT: store <4 x i32> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr 
inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2_LANE]], 1 +// CHECK-NEXT: store <4 x i32> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <4 x i32>] [[TMP2]], <4 x i32> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] poison, [2 x <4 x i32>] [[DOTUNPACK7]], 0 +// CHECK-NEXT: ret [[STRUCT_INT32X4X2_T]] [[TMP3]] +// int32x4x2_t test_vld2q_lane_s32(int32_t *a, int32x4x2_t b) { return vld2q_lane_s32(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.int64x2x2_t @test_vld2q_lane_s64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> -// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a) -// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.int64x2x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int64x2x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.int64x2x2_t @test_vld2q_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca 
[[STRUCT_INT64X2X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 1 +// CHECK-NEXT: store <2 x i64> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK7]], 0 +// CHECK-NEXT: ret [[STRUCT_INT64X2X2_T]] [[TMP3]] +// int64x2x2_t test_vld2q_lane_s64(int64_t *a, int64x2x2_t b) { return vld2q_lane_s64(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.float16x8x2_t @test_vld2q_lane_f16(ptr noundef %a, [2 x <8 x half>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr 
inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half> -// CHECK: [[VLD2_LANE:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0(<8 x half> [[TMP8]], <8 x half> [[TMP9]], i64 7, ptr %a) -// CHECK: store { <8 x half>, <8 x half> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.float16x8x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float16x8x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.float16x8x2_t @test_vld2q_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2_LANE]], 0 +// CHECK-NEXT: store <8 x half> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2_LANE]], 1 +// CHECK-NEXT: store <8 x half> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x half>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <8 x half>] poison, <8 x half> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <8 x half>] [[TMP2]], <8 x half> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T]] poison, [2 x <8 x half>] [[DOTUNPACK7]], 0 +// CHECK-NEXT: ret 
[[STRUCT_FLOAT16X8X2_T]] [[TMP3]] +// float16x8x2_t test_vld2q_lane_f16(float16_t *a, float16x8x2_t b) { return vld2q_lane_f16(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.float32x4x2_t @test_vld2q_lane_f32(ptr noundef %a, [2 x <4 x float>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> -// CHECK: [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, ptr %a) -// CHECK: store { <4 x float>, <4 x float> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float32x4x2_t [[TMP13]] +// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vld2q_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } 
@llvm.aarch64.neon.ld2lane.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2_LANE]], 0 +// CHECK-NEXT: store <4 x float> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2_LANE]], 1 +// CHECK-NEXT: store <4 x float> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x float>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <4 x float>] poison, <4 x float> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <4 x float>] [[TMP2]], <4 x float> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T]] poison, [2 x <4 x float>] [[DOTUNPACK7]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X2_T]] [[TMP3]] +// float32x4x2_t test_vld2q_lane_f32(float32_t *a, float32x4x2_t b) { return vld2q_lane_f32(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.float64x2x2_t @test_vld2q_lane_f64(ptr noundef %a, [2 x <2 x double>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double> -// CHECK: [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0(<2 x double> [[TMP8]], <2 x double> [[TMP9]], i64 1, ptr %a) -// CHECK: store { <2 x double>, <2 x double> } [[VLD2_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP13:%.*]] = load %struct.float64x2x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float64x2x2_t [[TMP13]] +// CHECK-LABEL: define dso_local 
%struct.float64x2x2_t @test_vld2q_lane_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2_LANE]], 0 +// CHECK-NEXT: store <2 x double> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <2 x double>, <2 x double> } [[VLD2_LANE]], 1 +// CHECK-NEXT: store <2 x double> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT5]], align 16 +// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <2 x double>] [[TMP2]], <2 x double> [[DOTUNPACK_UNPACK6]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT64X2X2_T]] poison, [2 x <2 x double>] [[DOTUNPACK7]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X2_T]] [[TMP3]] +// float64x2x2_t test_vld2q_lane_f64(float64_t *a, float64x2x2_t b) { return vld2q_lane_f64(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.poly16x8x2_t @test_vld2q_lane_p16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 
16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vld2q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <8 x i16>] [[TMP2]], <8 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T]] poison, [2 x <8 x i16>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X2_T]] [[TMP3]]
+//
poly16x8x2_t test_vld2q_lane_p16(poly16_t *a, poly16x8x2_t b) {
  return vld2q_lane_p16(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_lane_p64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x2_t @test_vld2q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD2_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X2_T]] [[TMP3]]
+//
poly64x2x2_t test_vld2q_lane_p64(poly64_t *a, poly64x2x2_t b) {
  return vld2q_lane_p64(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} %struct.uint8x8x2_t @test_vld2_lane_u8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vld2_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <8 x i8>] [[TMP2]], <8 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T]] poison, [2 x <8 x i8>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X2_T]] [[TMP3]]
+//
uint8x8x2_t test_vld2_lane_u8(uint8_t *a, uint8x8x2_t b) {
  return vld2_lane_u8(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} %struct.uint16x4x2_t @test_vld2_lane_u16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vld2_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <4 x i16>] [[TMP2]], <4 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T]] poison, [2 x <4 x i16>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X2_T]] [[TMP3]]
+//
uint16x4x2_t test_vld2_lane_u16(uint16_t *a, uint16x4x2_t b) {
  return vld2_lane_u16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} %struct.uint32x2x2_t @test_vld2_lane_u32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vld2_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <2 x i32>] [[TMP2]], <2 x i32> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T]] poison, [2 x <2 x i32>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X2_T]] [[TMP3]]
+//
uint32x2x2_t test_vld2_lane_u32(uint32_t *a, uint32x2x2_t b) {
  return vld2_lane_u32(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} %struct.uint64x1x2_t @test_vld2_lane_u64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.uint64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.uint64x1x2_t @test_vld2_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <1 x i64>] [[TMP2]], <1 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_UINT64X1X2_T]] poison, [2 x <1 x i64>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X1X2_T]] [[TMP3]]
+//
uint64x1x2_t test_vld2_lane_u64(uint64_t *a, uint64x1x2_t b) {
  return vld2_lane_u64(a, b, 0);
}

-// CHECK-LABEL: define{{.*}} %struct.int8x8x2_t @test_vld2_lane_s8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vld2_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <8 x i8>] [[TMP2]], <8 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT8X8X2_T]] poison, [2 x <8 x i8>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X8X2_T]] [[TMP3]]
+//
int8x8x2_t test_vld2_lane_s8(int8_t *a, int8x8x2_t b) {
  return vld2_lane_s8(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} %struct.int16x4x2_t @test_vld2_lane_s16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vld2_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <4 x i16>] [[TMP2]], <4 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT16X4X2_T]] poison, [2 x <4 x i16>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X4X2_T]] [[TMP3]]
+//
int16x4x2_t test_vld2_lane_s16(int16_t *a, int16x4x2_t b) {
  return vld2_lane_s16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} %struct.int32x2x2_t @test_vld2_lane_s32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vld2_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <2 x i32>] [[TMP2]], <2 x i32> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT32X2X2_T]] poison, [2 x <2 x i32>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X2X2_T]] [[TMP3]]
+//
int32x2x2_t test_vld2_lane_s32(int32_t *a, int32x2x2_t b) {
  return vld2_lane_s32(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} %struct.int64x1x2_t @test_vld2_lane_s64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.int64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.int64x1x2_t @test_vld2_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <1 x i64>] [[TMP2]], <1 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_INT64X1X2_T]] poison, [2 x <1 x i64>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X1X2_T]] [[TMP3]]
+//
int64x1x2_t test_vld2_lane_s64(int64_t *a, int64x1x2_t b) {
  return vld2_lane_s64(a, b, 0);
}

-// CHECK-LABEL: define{{.*}} %struct.float16x4x2_t @test_vld2_lane_f16(ptr noundef %a, [2 x <4 x half>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[VLD2_LANE:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0(<4 x half> [[TMP8]], <4 x half> [[TMP9]], i64 3, ptr %a)
-// CHECK: store { <4 x half>, <4 x half> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.float16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.float16x4x2_t @test_vld2_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <4 x half> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <4 x half> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x half>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <4 x half>] poison, <4 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <4 x half>] [[TMP2]], <4 x half> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T]] poison, [2 x <4 x half>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP3]]
+//
float16x4x2_t test_vld2_lane_f16(float16_t *a, float16x4x2_t b) {
  return vld2_lane_f16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} %struct.float32x2x2_t @test_vld2_lane_f32(ptr noundef %a, [2 x <2 x float>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, ptr %a)
-// CHECK: store { <2 x float>, <2 x float> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vld2_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <2 x float> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <2 x float> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x float>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <2 x float>] poison, <2 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <2 x float>] [[TMP2]], <2 x float> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T]] poison, [2 x <2 x float>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X2_T]] [[TMP3]]
+//
float32x2x2_t test_vld2_lane_f32(float32_t *a, float32x2x2_t b) {
  return vld2_lane_f32(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} %struct.float64x1x2_t @test_vld2_lane_f64(ptr noundef %a, [2 x <1 x double>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
-// CHECK: [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0(<1 x double> [[TMP8]], <1 x double> [[TMP9]], i64 0, ptr %a)
-// CHECK: store { <1 x double>, <1 x double> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.float64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.float64x1x2_t @test_vld2_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <1 x double> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <1 x double>, <1 x double> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <1 x double> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <1 x double>] [[TMP2]], <1 x double> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_FLOAT64X1X2_T]] poison, [2 x <1 x double>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X2_T]] [[TMP3]]
+//
float64x1x2_t test_vld2_lane_f64(float64_t *a, float64x1x2_t b) {
  return vld2_lane_f64(a, b, 0);
}

-// CHECK-LABEL: define{{.*}} %struct.poly8x8x2_t @test_vld2_lane_p8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vld2_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <8 x i8>] [[TMP2]], <8 x i8> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T]] poison, [2 x <8 x i8>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X2_T]] [[TMP3]]
+//
poly8x8x2_t test_vld2_lane_p8(poly8_t *a, poly8x8x2_t b) {
  return vld2_lane_p8(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} %struct.poly16x4x2_t @test_vld2_lane_p16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vld2_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <4 x i16>] [[TMP2]], <4 x i16> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T]] poison, [2 x <4 x i16>] [[DOTUNPACK7]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X2_T]] [[TMP3]]
+//
poly16x4x2_t test_vld2_lane_p16(poly16_t *a, poly16x4x2_t b) {
  return vld2_lane_p16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_lane_p64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP13:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x2_t [[TMP13]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x2_t @test_vld2_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD2_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD2_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_LANE_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_LANE]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD2_LANE_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[DOTUNPACK7:%.*]] = insertvalue [2 x <1 x i64>] [[TMP2]], <1 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY64X1X2_T]]
poison, [2 x <1 x i64>] [[DOTUNPACK7]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY64X1X2_T]] [[TMP3]] +// poly64x1x2_t test_vld2_lane_p64(poly64_t *a, poly64x1x2_t b) { return vld2_lane_p64(a, b, 0); } -// CHECK-LABEL: define{{.*}} %struct.uint16x8x3_t @test_vld3q_lane_u16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> -// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a) -// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP16:%.*]] = load %struct.uint16x8x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint16x8x3_t [[TMP16]] +// CHECK-LABEL: define dso_local %struct.uint16x8x3_t @test_vld3q_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, 
ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 0 +// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 1 +// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 2 +// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT9]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <8 x i16>] [[TMP3]], <8 x i16> [[DOTUNPACK_UNPACK10]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT11]], align 16 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <8 x i16>] [[TMP4]], <8 x i16> [[DOTUNPACK_UNPACK12]], 2 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_UINT16X8X3_T]] poison, [3 x <8 x i16>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT16X8X3_T]] [[TMP5]] +// uint16x8x3_t test_vld3q_lane_u16(uint16_t *a, uint16x8x3_t b) { return vld3q_lane_u16(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.uint32x4x3_t @test_vld3q_lane_u32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void 
@llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> -// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, ptr %a) -// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP16:%.*]] = load %struct.uint32x4x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint32x4x3_t [[TMP16]] +// CHECK-LABEL: define dso_local %struct.uint32x4x3_t @test_vld3q_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: 
[[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 0 +// CHECK-NEXT: store <4 x i32> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 1 +// CHECK-NEXT: store <4 x i32> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 2 +// CHECK-NEXT: store <4 x i32> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT9]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <4 x i32>] [[TMP3]], <4 x i32> [[DOTUNPACK_UNPACK10]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT11]], align 16 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <4 x i32>] [[TMP4]], <4 x i32> [[DOTUNPACK_UNPACK12]], 2 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_UINT32X4X3_T]] poison, [3 x <4 x i32>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT32X4X3_T]] [[TMP5]] +// uint32x4x3_t test_vld3q_lane_u32(uint32_t *a, uint32x4x3_t b) { return vld3q_lane_u32(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.uint64x2x3_t @test_vld3q_lane_u64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds 
nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> -// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP16:%.*]] = load %struct.uint64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint64x2x3_t [[TMP16]] +// CHECK-LABEL: define dso_local %struct.uint64x2x3_t @test_vld3q_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 1 +// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x 
i64> } [[VLD3_LANE]], 2 +// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <2 x i64>] [[TMP3]], <2 x i64> [[DOTUNPACK_UNPACK10]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <2 x i64>] [[TMP4]], <2 x i64> [[DOTUNPACK_UNPACK12]], 2 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_UINT64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT64X2X3_T]] [[TMP5]] +// uint64x2x3_t test_vld3q_lane_u64(uint64_t *a, uint64x2x3_t b) { return vld3q_lane_u64(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.int16x8x3_t @test_vld3q_lane_s16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> -// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a) -// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], 
ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP16:%.*]] = load %struct.int16x8x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int16x8x3_t [[TMP16]] +// CHECK-LABEL: define dso_local %struct.int16x8x3_t @test_vld3q_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 0 +// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 1 +// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 2 +// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT9]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <8 x i16>] [[TMP3]], <8 x i16> [[DOTUNPACK_UNPACK10]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr 
inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT11]], align 16 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <8 x i16>] [[TMP4]], <8 x i16> [[DOTUNPACK_UNPACK12]], 2 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_INT16X8X3_T]] poison, [3 x <8 x i16>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_INT16X8X3_T]] [[TMP5]] +// int16x8x3_t test_vld3q_lane_s16(int16_t *a, int16x8x3_t b) { return vld3q_lane_s16(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.int32x4x3_t @test_vld3q_lane_s32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> -// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, ptr %a) -// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP16:%.*]] = load %struct.int32x4x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int32x4x3_t [[TMP16]] +// CHECK-LABEL: define dso_local %struct.int32x4x3_t @test_vld3q_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: 
[[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 0 +// CHECK-NEXT: store <4 x i32> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 1 +// CHECK-NEXT: store <4 x i32> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], 2 +// CHECK-NEXT: store <4 x i32> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT9]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <4 x i32>] [[TMP3]], <4 x i32> [[DOTUNPACK_UNPACK10]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT11]], align 16 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <4 x i32>] [[TMP4]], <4 x i32> [[DOTUNPACK_UNPACK12]], 2 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_INT32X4X3_T]] poison, [3 x <4 x i32>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_INT32X4X3_T]] [[TMP5]] +// int32x4x3_t test_vld3q_lane_s32(int32_t *a, int32x4x3_t b) { return vld3q_lane_s32(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.int64x2x3_t @test_vld3q_lane_s64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16 
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> -// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP16:%.*]] = load %struct.int64x2x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int64x2x3_t [[TMP16]] +// CHECK-LABEL: define dso_local %struct.int64x2x3_t @test_vld3q_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// 
CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 1 +// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 2 +// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <2 x i64>] [[TMP3]], <2 x i64> [[DOTUNPACK_UNPACK10]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <2 x i64>] [[TMP4]], <2 x i64> [[DOTUNPACK_UNPACK12]], 2 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_INT64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_INT64X2X3_T]] [[TMP5]] +// int64x2x3_t test_vld3q_lane_s64(int64_t *a, int64x2x3_t b) { return vld3q_lane_s64(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.float16x8x3_t @test_vld3q_lane_f16(ptr noundef %a, [3 x <8 x half>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: 
[[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half> -// CHECK: [[VLD3_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0(<8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i64 7, ptr %a) -// CHECK: store { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false) -// CHECK: [[TMP16:%.*]] = load %struct.float16x8x3_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float16x8x3_t [[TMP16]] +// CHECK-LABEL: define dso_local %struct.float16x8x3_t @test_vld3q_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], 0 +// CHECK-NEXT: store <8 x half> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = 
extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], 1 +// CHECK-NEXT: store <8 x half> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], 2 +// CHECK-NEXT: store <8 x half> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x half>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <8 x half>] poison, <8 x half> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT9]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <8 x half>] [[TMP3]], <8 x half> [[DOTUNPACK_UNPACK10]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT11]], align 16 +// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <8 x half>] [[TMP4]], <8 x half> [[DOTUNPACK_UNPACK12]], 2 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_FLOAT16X8X3_T]] poison, [3 x <8 x half>] [[DOTUNPACK13]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X3_T]] [[TMP5]] +// float16x8x3_t test_vld3q_lane_f16(float16_t *a, float16x8x3_t b) { return vld3q_lane_f16(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.float32x4x3_t @test_vld3q_lane_f32(ptr noundef %a, [3 x <4 x float>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> -// CHECK: 
[[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
-// CHECK: [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0(<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i64 3, ptr %a)
-// CHECK: store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.float32x4x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.float32x4x3_t @test_vld3q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <4 x float> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <4 x float> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <4 x float> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x float>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <4 x float>] poison, <4 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <4 x float>] [[TMP3]], <4 x float> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <4 x float>] [[TMP4]], <4 x float> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_FLOAT32X4X3_T]] poison, [3 x <4 x float>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X3_T]] [[TMP5]]
+//
 float32x4x3_t test_vld3q_lane_f32(float32_t *a, float32x4x3_t b) {
   return vld3q_lane_f32(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} %struct.float64x2x3_t @test_vld3q_lane_f64(ptr noundef %a, [3 x <2 x double>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0(<2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x double> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.float64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float64x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.float64x2x3_t @test_vld3q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <2 x double> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <2 x double> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <2 x double> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <2 x double>] [[TMP3]], <2 x double> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <2 x double>] [[TMP4]], <2 x double> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_FLOAT64X2X3_T]] poison, [3 x <2 x double>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X3_T]] [[TMP5]]
+//
 float64x2x3_t test_vld3q_lane_f64(float64_t *a, float64x2x3_t b) {
   return vld3q_lane_f64(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} %struct.poly8x16x3_t @test_vld3q_lane_p8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP9:%.*]] = load %struct.poly8x16x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x3_t [[TMP9]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x3_t @test_vld3q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i64 15, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <16 x i8>] [[TMP3]], <16 x i8> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <16 x i8>] [[TMP4]], <16 x i8> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_POLY8X16X3_T]] poison, [3 x <16 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X3_T]] [[TMP5]]
+//
 poly8x16x3_t test_vld3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
   return vld3q_lane_p8(a, b, 15);
 }
-// CHECK-LABEL: define{{.*}} %struct.poly16x8x3_t @test_vld3q_lane_p16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.poly16x8x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x3_t @test_vld3q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <8 x i16>] [[TMP3]], <8 x i16> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <8 x i16>] [[TMP4]], <8 x i16> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_POLY16X8X3_T]] poison, [3 x <8 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X3_T]] [[TMP5]]
+//
 poly16x8x3_t test_vld3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
   return vld3q_lane_p16(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_lane_p64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x3_t @test_vld3q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <2 x i64>] [[TMP3]], <2 x i64> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <2 x i64>] [[TMP4]], <2 x i64> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X3_T]] [[TMP5]]
+//
 poly64x2x3_t test_vld3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
   return vld3q_lane_p64(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} %struct.uint8x8x3_t @test_vld3_lane_u8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP9:%.*]] = load %struct.uint8x8x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x3_t [[TMP9]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x3_t @test_vld3_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <8 x i8>] [[TMP3]], <8 x i8> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <8 x i8>] [[TMP4]], <8 x i8> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_UINT8X8X3_T]] poison, [3 x <8 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X3_T]] [[TMP5]]
+//
 uint8x8x3_t test_vld3_lane_u8(uint8_t *a, uint8x8x3_t b) {
   return vld3_lane_u8(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} %struct.uint16x4x3_t @test_vld3_lane_u16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.uint16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x3_t @test_vld3_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <4 x i16>] [[TMP3]], <4 x i16> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <4 x i16>] [[TMP4]], <4 x i16> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_UINT16X4X3_T]] poison, [3 x <4 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X3_T]] [[TMP5]]
++//
 uint16x4x3_t test_vld3_lane_u16(uint16_t *a, uint16x4x3_t b) {
   return vld3_lane_u16(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} %struct.uint32x2x3_t @test_vld3_lane_u32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.uint32x2x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x3_t @test_vld3_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <2 x i32>] [[TMP3]], <2 x i32> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <2 x i32>] [[TMP4]], <2 x i32> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_UINT32X2X3_T]] poison, [3 x <2 x i32>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X3_T]] [[TMP5]]
+//
 uint32x2x3_t test_vld3_lane_u32(uint32_t *a, uint32x2x3_t b) {
   return vld3_lane_u32(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} %struct.uint64x1x3_t @test_vld3_lane_u64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.uint64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.uint64x1x3_t @test_vld3_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <1 x i64>] [[TMP3]], <1 x i64> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <1 x i64>] [[TMP4]], <1 x i64> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_UINT64X1X3_T]] poison, [3 x <1 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X1X3_T]] [[TMP5]]
+//
 uint64x1x3_t test_vld3_lane_u64(uint64_t *a, uint64x1x3_t b) {
   return vld3_lane_u64(a, b, 0);
 }
-// CHECK-LABEL: define{{.*}} %struct.int8x8x3_t @test_vld3_lane_s8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP9:%.*]] = load %struct.int8x8x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x3_t [[TMP9]]
+// CHECK-LABEL: define dso_local %struct.int8x8x3_t @test_vld3_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <8 x i8>] [[TMP3]], <8 x i8> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <8 x i8>] [[TMP4]], <8 x i8> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_INT8X8X3_T]] poison, [3 x <8 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X8X3_T]] [[TMP5]]
+//
 int8x8x3_t test_vld3_lane_s8(int8_t *a, int8x8x3_t b) {
   return vld3_lane_s8(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} %struct.int16x4x3_t @test_vld3_lane_s16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.int16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.int16x4x3_t @test_vld3_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <4 x i16>] [[TMP3]], <4 x i16> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <4 x i16>] [[TMP4]], <4 x i16> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_INT16X4X3_T]] poison, [3 x <4 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X4X3_T]] [[TMP5]]
+//
 int16x4x3_t test_vld3_lane_s16(int16_t *a, int16x4x3_t b) {
   return vld3_lane_s16(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} %struct.int32x2x3_t @test_vld3_lane_s32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.int32x2x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.int32x2x3_t @test_vld3_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <2 x i32>] [[TMP3]], <2 x i32> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <2 x i32>] [[TMP4]], <2 x i32> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_INT32X2X3_T]] poison, [3 x <2 x i32>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X2X3_T]] [[TMP5]]
+//
 int32x2x3_t test_vld3_lane_s32(int32_t *a, int32x2x3_t b) {
   return vld3_lane_s32(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} %struct.int64x1x3_t @test_vld3_lane_s64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load
%struct.int64x1x3_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int64x1x3_t [[TMP16]] +// CHECK-LABEL: define dso_local %struct.int64x1x3_t @test_vld3_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X3_T:%.*]], align 8 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], i64 0, ptr [[A]]) +// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 0 +// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 1 +// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 2 +// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT9]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <1 x i64>] [[TMP3]], <1 x i64> [[DOTUNPACK_UNPACK10]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x i64>, 
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <1 x i64>] [[TMP4]], <1 x i64> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_INT64X1X3_T]] poison, [3 x <1 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X1X3_T]] [[TMP5]]
+//
int64x1x3_t test_vld3_lane_s64(int64_t *a, int64x1x3_t b) {
  return vld3_lane_s64(a, b, 0);
}

-// CHECK-LABEL: define{{.*}} %struct.float16x4x3_t @test_vld3_lane_f16(ptr noundef %a, [3 x <4 x half>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK: [[VLD3_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0(<4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i64 3, ptr %a)
-// CHECK: store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.float16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.float16x4x3_t @test_vld3_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <4 x half> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <4 x half> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <4 x half> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x half>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <4 x half>] poison, <4 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <4 x half>] [[TMP3]], <4 x half> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <4 x half>] [[TMP4]], <4 x half> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_FLOAT16X4X3_T]] poison, [3 x <4 x half>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X3_T]] [[TMP5]]
+//
float16x4x3_t test_vld3_lane_f16(float16_t *a, float16x4x3_t b) {
  return vld3_lane_f16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} %struct.float32x2x3_t @test_vld3_lane_f32(ptr noundef %a, [3 x <2 x float>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
-// CHECK: [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0(<2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i64 1, ptr %a)
-// CHECK: store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.float32x2x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.float32x2x3_t @test_vld3_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <2 x float> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <2 x float> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <2 x float> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x float>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <2 x float>] poison, <2 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <2 x float>] [[TMP3]], <2 x float> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <2 x float>] [[TMP4]], <2 x float> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_FLOAT32X2X3_T]] poison, [3 x <2 x float>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X3_T]] [[TMP5]]
+//
float32x2x3_t test_vld3_lane_f32(float32_t *a, float32x2x3_t b) {
  return vld3_lane_f32(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} %struct.float64x1x3_t @test_vld3_lane_f64(ptr noundef %a, [3 x <1 x double>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
-// CHECK: [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0(<1 x double> [[TMP10]], <1 x double> [[TMP11]], <1 x double> [[TMP12]], i64 0, ptr %a)
-// CHECK: store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.float64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.float64x1x3_t @test_vld3_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], <1 x double> [[TMP2]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <1 x double> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <1 x double> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <1 x double> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <1 x double>] [[TMP3]], <1 x double> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <1 x double>] [[TMP4]], <1 x double> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_FLOAT64X1X3_T]] poison, [3 x <1 x double>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X3_T]] [[TMP5]]
+//
float64x1x3_t test_vld3_lane_f64(float64_t *a, float64x1x3_t b) {
  return vld3_lane_f64(a, b, 0);
}

-// CHECK-LABEL: define{{.*}} %struct.poly8x8x3_t @test_vld3_lane_p8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP9:%.*]] = load %struct.poly8x8x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x3_t [[TMP9]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x3_t @test_vld3_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <8 x i8>] [[TMP3]], <8 x i8> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <8 x i8>] [[TMP4]], <8 x i8> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_POLY8X8X3_T]] poison, [3 x <8 x i8>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X3_T]] [[TMP5]]
+//
poly8x8x3_t test_vld3_lane_p8(poly8_t *a, poly8x8x3_t b) {
  return vld3_lane_p8(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} %struct.poly16x4x3_t @test_vld3_lane_p16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.poly16x4x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x3_t @test_vld3_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <4 x i16>] [[TMP3]], <4 x i16> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <4 x i16>] [[TMP4]], <4 x i16> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_POLY16X4X3_T]] poison, [3 x <4 x i16>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X3_T]] [[TMP5]]
+//
poly16x4x3_t test_vld3_lane_p16(poly16_t *a, poly16x4x3_t b) {
  return vld3_lane_p16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_lane_p64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP16:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x3_t [[TMP16]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x3_t @test_vld3_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD3_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_LANE_ELT6:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_LANE_ELT8:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD3_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [3 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [3 x <1 x i64>] [[TMP3]], <1 x i64> [[DOTUNPACK_UNPACK10]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [3 x <1 x i64>] [[TMP4]], <1 x i64> [[DOTUNPACK_UNPACK12]], 2
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T]] poison, [3 x <1 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X3_T]] [[TMP5]]
+//
poly64x1x3_t test_vld3_lane_p64(poly64_t *a, poly64x1x3_t b) {
  return vld3_lane_p64(a, b, 0);
}

-// CHECK-LABEL: define{{.*}} %struct.uint8x16x4_t @test_vld4q_lane_u8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a)
-// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP10:%.*]] = load %struct.uint8x16x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x4_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x4_t @test_vld4q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT13]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <16 x i8>] [[TMP4]], <16 x i8> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT15]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <16 x i8>] [[TMP5]], <16 x i8> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT17]], align 16
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <16 x i8>] [[TMP6]], <16 x i8> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_UINT8X16X4_T]] poison, [4 x <16 x i8>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X4_T]] [[TMP7]]
+//
uint8x16x4_t test_vld4q_lane_u8(uint8_t *a, uint8x16x4_t b) {
  return vld4q_lane_u8(a, b, 15);
}

-// CHECK-LABEL: define{{.*}} %struct.uint16x8x4_t @test_vld4q_lane_u16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a)
-// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.uint16x8x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x4_t @test_vld4q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT13]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <8 x i16>] [[TMP4]], <8 x i16> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT15]], align 16
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <8 x i16>] [[TMP5]], <8 x i16> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT17]], align 16
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <8 x i16>] [[TMP6]], <8 x i16> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_UINT16X8X4_T]] poison, [4 x <8 x i16>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X4_T]] [[TMP7]]
+//
uint16x8x4_t test_vld4q_lane_u16(uint16_t *a, uint16x8x4_t b) {
  return vld4q_lane_u16(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} %struct.uint32x4x4_t @test_vld4q_lane_u32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
-// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, ptr %a)
-// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.uint32x4x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x4_t @test_vld4q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] =
extractvalue [4 x <4 x i32>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 0 +// CHECK-NEXT: store <4 x i32> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 1 +// CHECK-NEXT: store <4 x i32> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 2 +// CHECK-NEXT: store <4 x i32> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16 +// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 3 +// CHECK-NEXT: store <4 x i32> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT13]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <4 x i32>] [[TMP4]], <4 x i32> [[DOTUNPACK_UNPACK14]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT15]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <4 x i32>] [[TMP5]], <4 x i32> [[DOTUNPACK_UNPACK16]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <4 x i32>, ptr 
[[DOTUNPACK_ELT17]], align 16 +// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <4 x i32>] [[TMP6]], <4 x i32> [[DOTUNPACK_UNPACK18]], 3 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_UINT32X4X4_T]] poison, [4 x <4 x i32>] [[DOTUNPACK19]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT32X4X4_T]] [[TMP7]] +// uint32x4x4_t test_vld4q_lane_u32(uint32_t *a, uint32x4x4_t b) { return vld4q_lane_u32(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.uint64x2x4_t @test_vld4q_lane_u64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64> -// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.uint64x2x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint64x2x4_t [[TMP19]] +// CHECK-LABEL: define dso_local %struct.uint64x2x4_t @test_vld4q_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16 +// CHECK-NEXT: 
[[B:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 1 +// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 2 +// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16 +// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 3 +// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: 
[[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT13]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <2 x i64>] [[TMP4]], <2 x i64> [[DOTUNPACK_UNPACK14]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT15]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <2 x i64>] [[TMP5]], <2 x i64> [[DOTUNPACK_UNPACK16]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT17]], align 16 +// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <2 x i64>] [[TMP6]], <2 x i64> [[DOTUNPACK_UNPACK18]], 3 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_UINT64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK19]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT64X2X4_T]] [[TMP7]] +// uint64x2x4_t test_vld4q_lane_u64(uint64_t *a, uint64x2x4_t b) { return vld4q_lane_u64(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.int8x16x4_t @test_vld4q_lane_s8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a) -// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP10:%.*]] = load %struct.int8x16x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int8x16x4_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.int8x16x4_t @test_vld4q_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x 
i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 0 +// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 1 +// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 2 +// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16 +// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 3 +// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load 
<16 x i8>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT13]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <16 x i8>] [[TMP4]], <16 x i8> [[DOTUNPACK_UNPACK14]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT15]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <16 x i8>] [[TMP5]], <16 x i8> [[DOTUNPACK_UNPACK16]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT17]], align 16 +// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <16 x i8>] [[TMP6]], <16 x i8> [[DOTUNPACK_UNPACK18]], 3 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_INT8X16X4_T]] poison, [4 x <16 x i8>] [[DOTUNPACK19]], 0 +// CHECK-NEXT: ret [[STRUCT_INT8X16X4_T]] [[TMP7]] +// int8x16x4_t test_vld4q_lane_s8(int8_t *a, int8x16x4_t b) { return vld4q_lane_s8(a, b, 15); } -// CHECK-LABEL: define{{.*}} %struct.int16x8x4_t @test_vld4q_lane_s16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> 
[[TMP11]] to <8 x i16> -// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a) -// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.int16x8x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int16x8x4_t [[TMP19]] +// CHECK-LABEL: define dso_local %struct.int16x8x4_t @test_vld4q_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 0 +// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 1 +// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = 
extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 2 +// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16 +// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 3 +// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT13]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <8 x i16>] [[TMP4]], <8 x i16> [[DOTUNPACK_UNPACK14]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT15]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <8 x i16>] [[TMP5]], <8 x i16> [[DOTUNPACK_UNPACK16]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT17]], align 16 +// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <8 x i16>] [[TMP6]], <8 x i16> [[DOTUNPACK_UNPACK18]], 3 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_INT16X8X4_T]] poison, [4 x <8 x i16>] [[DOTUNPACK19]], 0 +// CHECK-NEXT: ret [[STRUCT_INT16X8X4_T]] [[TMP7]] +// int16x8x4_t test_vld4q_lane_s16(int16_t *a, int16x8x4_t b) { return vld4q_lane_s16(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.int32x4x4_t @test_vld4q_lane_s32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = 
load <4 x i32>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32> -// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, ptr %a) -// CHECK: store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.int32x4x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int32x4x4_t [[TMP19]] +// CHECK-LABEL: define dso_local %struct.int32x4x4_t @test_vld4q_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> 
[[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 0 +// CHECK-NEXT: store <4 x i32> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 1 +// CHECK-NEXT: store <4 x i32> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 2 +// CHECK-NEXT: store <4 x i32> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16 +// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], 3 +// CHECK-NEXT: store <4 x i32> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT13]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <4 x i32>] [[TMP4]], <4 x i32> [[DOTUNPACK_UNPACK14]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT15]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <4 x i32>] [[TMP5]], <4 x i32> [[DOTUNPACK_UNPACK16]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <4 x i32>, ptr [[DOTUNPACK_ELT17]], align 16 +// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <4 x i32>] [[TMP6]], <4 x i32> [[DOTUNPACK_UNPACK18]], 3 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_INT32X4X4_T]] poison, [4 x <4 x i32>] [[DOTUNPACK19]], 0 +// CHECK-NEXT: ret [[STRUCT_INT32X4X4_T]] [[TMP7]] +// int32x4x4_t test_vld4q_lane_s32(int32_t *a, int32x4x4_t b) { return vld4q_lane_s32(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.int64x2x4_t @test_vld4q_lane_s64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// 
CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64> -// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.int64x2x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int64x2x4_t [[TMP19]] +// CHECK-LABEL: define dso_local %struct.int64x2x4_t @test_vld4q_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: 
[[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 1 +// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 2 +// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16 +// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 3 +// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT13]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <2 x i64>] [[TMP4]], <2 x i64> [[DOTUNPACK_UNPACK14]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT15]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <2 x i64>] [[TMP5]], <2 x i64> [[DOTUNPACK_UNPACK16]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT17]], align 16 +// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <2 x i64>] [[TMP6]], <2 x i64> [[DOTUNPACK_UNPACK18]], 3 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_INT64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK19]], 0 +// CHECK-NEXT: ret [[STRUCT_INT64X2X4_T]] [[TMP7]] +// int64x2x4_t test_vld4q_lane_s64(int64_t *a, int64x2x4_t b) { return vld4q_lane_s64(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.float16x8x4_t @test_vld4q_lane_f16(ptr noundef %a, [4 x <8 x half>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca 
%struct.float16x8x4_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half> -// CHECK: [[VLD4_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0(<8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i64 7, ptr %a) -// CHECK: store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.float16x8x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float16x8x4_t [[TMP19]] +// CHECK-LABEL: define dso_local %struct.float16x8x4_t @test_vld4q_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 1 +// 
CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 3 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], <8 x half> [[TMP3]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], 0 +// CHECK-NEXT: store <8 x half> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], 1 +// CHECK-NEXT: store <8 x half> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], 2 +// CHECK-NEXT: store <8 x half> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16 +// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], 3 +// CHECK-NEXT: store <8 x half> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x half>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <8 x half>] poison, <8 x half> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT13]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <8 x half>] [[TMP4]], <8 x half> [[DOTUNPACK_UNPACK14]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT15]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = 
insertvalue [4 x <8 x half>] [[TMP5]], <8 x half> [[DOTUNPACK_UNPACK16]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <8 x half>, ptr [[DOTUNPACK_ELT17]], align 16 +// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <8 x half>] [[TMP6]], <8 x half> [[DOTUNPACK_UNPACK18]], 3 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_FLOAT16X8X4_T]] poison, [4 x <8 x half>] [[DOTUNPACK19]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X4_T]] [[TMP7]] +// float16x8x4_t test_vld4q_lane_f16(float16_t *a, float16x8x4_t b) { return vld4q_lane_f16(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.float32x4x4_t @test_vld4q_lane_f32(ptr noundef %a, [4 x <4 x float>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float> -// CHECK: [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0(<4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i64 3, ptr %a) -// CHECK: store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.float32x4x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float32x4x4_t 
[[TMP19]] +// CHECK-LABEL: define dso_local %struct.float32x4x4_t @test_vld4q_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 3 +// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], 0 +// CHECK-NEXT: store <4 x float> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], 1 +// CHECK-NEXT: store <4 x float> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], 2 +// CHECK-NEXT: store <4 x float> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16 +// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], 3 +// CHECK-NEXT: store <4 x float> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16 +// 
CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x float>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <4 x float>] poison, <4 x float> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT13]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <4 x float>] [[TMP4]], <4 x float> [[DOTUNPACK_UNPACK14]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT15]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <4 x float>] [[TMP5]], <4 x float> [[DOTUNPACK_UNPACK16]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <4 x float>, ptr [[DOTUNPACK_ELT17]], align 16 +// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <4 x float>] [[TMP6]], <4 x float> [[DOTUNPACK_UNPACK18]], 3 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_FLOAT32X4X4_T]] poison, [4 x <4 x float>] [[DOTUNPACK19]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X4_T]] [[TMP7]] +// float32x4x4_t test_vld4q_lane_f32(float32_t *a, float32x4x4_t b) { return vld4q_lane_f32(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.float64x2x4_t @test_vld4q_lane_f64(ptr noundef %a, [4 x <2 x double>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], 
align 16 -// CHECK: [[TMP11:%.*]] = bitcast <2 x double> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x double> -// CHECK: [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0(<2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]], i64 1, ptr %a) -// CHECK: store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.float64x2x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float64x2x4_t [[TMP19]] +// CHECK-LABEL: define dso_local %struct.float64x2x4_t @test_vld4q_lane_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 3 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], 0 +// CHECK-NEXT: store <2 x double> 
[[VLD4_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], 1 +// CHECK-NEXT: store <2 x double> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], 2 +// CHECK-NEXT: store <2 x double> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16 +// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], 3 +// CHECK-NEXT: store <2 x double> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x double>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <2 x double>] poison, <2 x double> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT13]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <2 x double>] [[TMP4]], <2 x double> [[DOTUNPACK_UNPACK14]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT15]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <2 x double>] [[TMP5]], <2 x double> [[DOTUNPACK_UNPACK16]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <2 x double>, ptr [[DOTUNPACK_ELT17]], align 16 +// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <2 x double>] [[TMP6]], <2 x double> [[DOTUNPACK_UNPACK18]], 3 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_FLOAT64X2X4_T]] poison, [4 x <2 x double>] [[DOTUNPACK19]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT64X2X4_T]] [[TMP7]] +// float64x2x4_t test_vld4q_lane_f64(float64_t *a, float64x2x4_t b) { return vld4q_lane_f64(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.poly8x16x4_t @test_vld4q_lane_p8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw 
%struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, ptr %a) -// CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP10:%.*]] = load %struct.poly8x16x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly8x16x4_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.poly8x16x4_t @test_vld4q_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } 
@llvm.aarch64.neon.ld4lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 0 +// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 1 +// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 2 +// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16 +// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], 3 +// CHECK-NEXT: store <16 x i8> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <16 x i8>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT13]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <16 x i8>] [[TMP4]], <16 x i8> [[DOTUNPACK_UNPACK14]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT15]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <16 x i8>] [[TMP5]], <16 x i8> [[DOTUNPACK_UNPACK16]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <16 x i8>, ptr [[DOTUNPACK_ELT17]], align 16 +// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <16 x i8>] [[TMP6]], <16 x i8> [[DOTUNPACK_UNPACK18]], 3 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_POLY8X16X4_T]] poison, [4 x <16 x i8>] [[DOTUNPACK19]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY8X16X4_T]] [[TMP7]] +// poly8x16x4_t test_vld4q_lane_p8(poly8_t *a, poly8x16x4_t b) { return vld4q_lane_p8(a, b, 15); } -// CHECK-LABEL: define{{.*}} %struct.poly16x8x4_t @test_vld4q_lane_p16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: 
[[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> -// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, ptr %a) -// CHECK: store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.poly16x8x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly16x8x4_t [[TMP19]] +// CHECK-LABEL: define dso_local %struct.poly16x8x4_t @test_vld4q_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) 
[[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 0 +// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 1 +// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 2 +// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16 +// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], 3 +// CHECK-NEXT: store <8 x i16> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i16>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <8 x i16>] poison, <8 x i16> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT13]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <8 x i16>] [[TMP4]], <8 x i16> [[DOTUNPACK_UNPACK14]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT15]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <8 x i16>] [[TMP5]], <8 x i16> [[DOTUNPACK_UNPACK16]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <8 x i16>, ptr [[DOTUNPACK_ELT17]], align 16 +// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <8 x i16>] [[TMP6]], <8 x i16> [[DOTUNPACK_UNPACK18]], 3 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_POLY16X8X4_T]] poison, [4 x <8 x i16>] [[DOTUNPACK19]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY16X8X4_T]] [[TMP7]] +// poly16x8x4_t test_vld4q_lane_p16(poly16_t *a, poly16x8x4_t b) { return vld4q_lane_p16(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t 
@test_vld4q_lane_p64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64> -// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, ptr %a) -// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly64x2x4_t [[TMP19]] +// CHECK-LABEL: define dso_local %struct.poly64x2x4_t @test_vld4q_lane_p64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: 
[[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 0 +// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 1 +// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 16 +// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32 +// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 2 +// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 16 +// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48 +// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], 3 +// CHECK-NEXT: store <2 x i64> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT13]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <2 x i64>] [[TMP4]], <2 x i64> [[DOTUNPACK_UNPACK14]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32 +// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT15]], align 16 
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <2 x i64>] [[TMP5]], <2 x i64> [[DOTUNPACK_UNPACK16]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48 +// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT17]], align 16 +// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <2 x i64>] [[TMP6]], <2 x i64> [[DOTUNPACK_UNPACK18]], 3 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK19]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY64X2X4_T]] [[TMP7]] +// poly64x2x4_t test_vld4q_lane_p64(poly64_t *a, poly64x2x4_t b) { return vld4q_lane_p64(a, b, 1); } -// CHECK-LABEL: define{{.*}} %struct.uint8x8x4_t @test_vld4_lane_u8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a) -// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP10:%.*]] = load %struct.uint8x8x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint8x8x4_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.uint8x8x4_t @test_vld4_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 
8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24 +// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 0 +// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 1 +// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8 +// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 2 +// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8 +// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24 +// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 3 +// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT13]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <8 x i8>] [[TMP4]], <8 x i8> [[DOTUNPACK_UNPACK14]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <8 x i8>, 
ptr [[DOTUNPACK_ELT15]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <8 x i8>] [[TMP5]], <8 x i8> [[DOTUNPACK_UNPACK16]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24 +// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT17]], align 8 +// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <8 x i8>] [[TMP6]], <8 x i8> [[DOTUNPACK_UNPACK18]], 3 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_UINT8X8X4_T]] poison, [4 x <8 x i8>] [[DOTUNPACK19]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT8X8X4_T]] [[TMP7]] +// uint8x8x4_t test_vld4_lane_u8(uint8_t *a, uint8x8x4_t b) { return vld4_lane_u8(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.uint16x4x4_t @test_vld4_lane_u16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> -// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a) -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.uint16x4x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint16x4x4_t [[TMP19]] +// CHECK-LABEL: define dso_local 
%struct.uint16x4x4_t @test_vld4_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 0 +// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 1 +// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8 +// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 2 +// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8 +// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24 +// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 3 +// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) 
[[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT13]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <4 x i16>] [[TMP4]], <4 x i16> [[DOTUNPACK_UNPACK14]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT15]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <4 x i16>] [[TMP5]], <4 x i16> [[DOTUNPACK_UNPACK16]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24 +// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT17]], align 8 +// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <4 x i16>] [[TMP6]], <4 x i16> [[DOTUNPACK_UNPACK18]], 3 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_UINT16X4X4_T]] poison, [4 x <4 x i16>] [[DOTUNPACK19]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT16X4X4_T]] [[TMP7]] +// uint16x4x4_t test_vld4_lane_u16(uint16_t *a, uint16x4x4_t b) { return vld4_lane_u16(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.uint32x2x4_t @test_vld4_lane_u32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> 
[[TMP9]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
-// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.uint32x2x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x4_t @test_vld4_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <2 x i32>] [[TMP4]], <2 x i32> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <2 x i32>] [[TMP5]], <2 x i32> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <2 x i32>] [[TMP6]], <2 x i32> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_UINT32X2X4_T]] poison, [4 x <2 x i32>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X4_T]] [[TMP7]]
+//
 uint32x2x4_t test_vld4_lane_u32(uint32_t *a, uint32x2x4_t b) {
   return vld4_lane_u32(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} %struct.uint64x1x4_t @test_vld4_lane_u64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.uint64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint64x1x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.uint64x1x4_t @test_vld4_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <1 x i64>] [[TMP4]], <1 x i64> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <1 x i64>] [[TMP5]], <1 x i64> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <1 x i64>] [[TMP6]], <1 x i64> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_UINT64X1X4_T]] poison, [4 x <1 x i64>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT64X1X4_T]] [[TMP7]]
+//
 uint64x1x4_t test_vld4_lane_u64(uint64_t *a, uint64x1x4_t b) {
   return vld4_lane_u64(a, b, 0);
 }

-// CHECK-LABEL: define{{.*}} %struct.int8x8x4_t @test_vld4_lane_s8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP10:%.*]] = load %struct.int8x8x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x4_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int8x8x4_t @test_vld4_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <8 x i8>] [[TMP4]], <8 x i8> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <8 x i8>] [[TMP5]], <8 x i8> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <8 x i8>] [[TMP6]], <8 x i8> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_INT8X8X4_T]] poison, [4 x <8 x i8>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X8X4_T]] [[TMP7]]
+//
 int8x8x4_t test_vld4_lane_s8(int8_t *a, int8x8x4_t b) {
   return vld4_lane_s8(a, b, 7);
 }

-// CHECK-LABEL: define{{.*}} %struct.int16x4x4_t @test_vld4_lane_s16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a)
-// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.int16x4x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.int16x4x4_t @test_vld4_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <4 x i16>] [[TMP4]], <4 x i16> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <4 x i16>] [[TMP5]], <4 x i16> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <4 x i16>] [[TMP6]], <4 x i16> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_INT16X4X4_T]] poison, [4 x <4 x i16>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X4X4_T]] [[TMP7]]
+//
 int16x4x4_t test_vld4_lane_s16(int16_t *a, int16x4x4_t b) {
   return vld4_lane_s16(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} %struct.int32x2x4_t @test_vld4_lane_s32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
-// CHECK: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, ptr %a)
-// CHECK: store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.int32x2x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.int32x2x4_t @test_vld4_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <2 x i32>] [[TMP4]], <2 x i32> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <2 x i32>] [[TMP5]], <2 x i32> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <2 x i32>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <2 x i32>] [[TMP6]], <2 x i32> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_INT32X2X4_T]] poison, [4 x <2 x i32>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X2X4_T]] [[TMP7]]
+//
 int32x2x4_t test_vld4_lane_s32(int32_t *a, int32x2x4_t b) {
   return vld4_lane_s32(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} %struct.int64x1x4_t @test_vld4_lane_s64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.int64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int64x1x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.int64x1x4_t @test_vld4_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <1 x i64>] [[TMP4]], <1 x i64> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <1 x i64>] [[TMP5]], <1 x i64> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <1 x i64>] [[TMP6]], <1 x i64> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_INT64X1X4_T]] poison, [4 x <1 x i64>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_INT64X1X4_T]] [[TMP7]]
+//
 int64x1x4_t test_vld4_lane_s64(int64_t *a, int64x1x4_t b) {
   return vld4_lane_s64(a, b, 0);
 }

-// CHECK-LABEL: define{{.*}} %struct.float16x4x4_t @test_vld4_lane_f16(ptr noundef %a, [4 x <4 x half>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
-// CHECK: [[VLD4_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0(<4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i64 3, ptr %a)
-// CHECK: store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.float16x4x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float16x4x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.float16x4x4_t @test_vld4_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], <4 x half> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <4 x half> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <4 x half> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <4 x half> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <4 x half> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x half>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <4 x half>] poison, <4 x half> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <4 x half>] [[TMP4]], <4 x half> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <4 x half>] [[TMP5]], <4 x half> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <4 x half>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <4 x half>] [[TMP6]], <4 x half> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_FLOAT16X4X4_T]] poison, [4 x <4 x half>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X4_T]] [[TMP7]]
+//
 float16x4x4_t test_vld4_lane_f16(float16_t *a, float16x4x4_t b) {
   return vld4_lane_f16(a, b, 3);
 }

-// CHECK-LABEL: define{{.*}} %struct.float32x2x4_t @test_vld4_lane_f32(ptr noundef %a, [4 x <2 x float>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
-// CHECK: [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0(<2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i64 1, ptr %a)
-// CHECK: store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.float32x2x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.float32x2x4_t @test_vld4_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <2 x float> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <2 x float> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <2 x float> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <2 x float> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x float>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <2 x float>] poison, <2 x float> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <2 x float>] [[TMP4]], <2 x float> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <2 x float>] [[TMP5]], <2 x float> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <2 x float>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <2 x float>] [[TMP6]], <2 x float> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_FLOAT32X2X4_T]] poison, [4 x <2 x float>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X4_T]] [[TMP7]]
+//
 float32x2x4_t test_vld4_lane_f32(float32_t *a, float32x2x4_t b) {
   return vld4_lane_f32(a, b, 1);
 }

-// CHECK-LABEL: define{{.*}} %struct.float64x1x4_t @test_vld4_lane_f64(ptr noundef %a, [4 x <1 x double>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP4:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP6:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP8:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP10:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <1 x double> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x double>
-// CHECK: [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0(<1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], <1 x double> [[TMP15]], i64 0, ptr %a)
-// CHECK: store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP19:%.*]] = load %struct.float64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float64x1x4_t [[TMP19]]
+// CHECK-LABEL: define dso_local %struct.float64x1x4_t @test_vld4_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], <1 x double> [[TMP2]], <1 x double> [[TMP3]], i64 0, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <1 x double> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <1 x double> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <1 x double> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <1 x double> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x double>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <1 x double>] poison, <1 x double> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <1 x double>] [[TMP4]], <1 x double> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <1 x double>] [[TMP5]], <1 x double> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <1 x double>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <1 x double>] [[TMP6]], <1 x double> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_FLOAT64X1X4_T]] poison, [4 x <1 x double>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT64X1X4_T]] [[TMP7]]
+//
 float64x1x4_t test_vld4_lane_f64(float64_t *a, float64x1x4_t b) {
   return vld4_lane_f64(a, b, 0);
 }

-// CHECK-LABEL: define{{.*}} %struct.poly8x8x4_t @test_vld4_lane_p8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, ptr %a)
-// CHECK: store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP10:%.*]] = load %struct.poly8x8x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x4_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x4_t @test_vld4_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 0
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8
+// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8
+// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <8 x i8>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <8 x i8>] poison, <8 x i8> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <8 x i8>] [[TMP4]], <8 x i8> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <8 x i8>] [[TMP5]], <8 x i8> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <8 x i8>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <8 x i8>] [[TMP6]], <8 x i8> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_POLY8X8X4_T]] poison, [4 x <8 x i8>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X4_T]] [[TMP7]]
+//
 poly8x8x4_t
test_vld4_lane_p8(poly8_t *a, poly8x8x4_t b) { return vld4_lane_p8(a, b, 7); } -// CHECK-LABEL: define{{.*}} %struct.poly16x4x4_t @test_vld4_lane_p16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> -// CHECK: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, ptr %a) -// CHECK: store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.poly16x4x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly16x4x4_t [[TMP19]] +// CHECK-LABEL: define dso_local %struct.poly16x4x4_t @test_vld4_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8 +// 
CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 0 +// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 1 +// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8 +// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 2 +// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8 +// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24 +// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], 3 +// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false) +// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <4 x i16>, ptr [[RETVAL]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <4 x i16>] poison, <4 x i16> [[DOTUNPACK_UNPACK]], 0 +// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8 +// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT13]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <4 x i16>] [[TMP4]], <4 x i16> [[DOTUNPACK_UNPACK14]], 1 +// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16 +// CHECK-NEXT: 
[[DOTUNPACK_UNPACK16:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT15]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <4 x i16>] [[TMP5]], <4 x i16> [[DOTUNPACK_UNPACK16]], 2 +// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24 +// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <4 x i16>, ptr [[DOTUNPACK_ELT17]], align 8 +// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <4 x i16>] [[TMP6]], <4 x i16> [[DOTUNPACK_UNPACK18]], 3 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_POLY16X4X4_T]] poison, [4 x <4 x i16>] [[DOTUNPACK19]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY16X4X4_T]] [[TMP7]] +// poly16x4x4_t test_vld4_lane_p16(poly16_t *a, poly16x4x4_t b) { return vld4_lane_p16(a, b, 3); } -// CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_lane_p64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 { -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> -// CHECK: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, ptr %a) -// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], ptr [[__RET]] -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false) -// CHECK: [[TMP19:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly64x1x4_t 
[[TMP19]] +// CHECK-LABEL: define dso_local %struct.poly64x1x4_t @test_vld4_lane_p64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8 +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24 +// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8 +// CHECK-NEXT: [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]]) +// CHECK-NEXT: [[VLD4_LANE_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 0 +// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[__RET_REPACK7:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8 +// CHECK-NEXT: [[VLD4_LANE_ELT8:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 1 +// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT8]], ptr [[__RET_REPACK7]], align 8 +// CHECK-NEXT: [[__RET_REPACK9:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16 +// CHECK-NEXT: [[VLD4_LANE_ELT10:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 2 +// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT10]], ptr [[__RET_REPACK9]], align 8 +// CHECK-NEXT: [[__RET_REPACK11:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24 +// CHECK-NEXT: [[VLD4_LANE_ELT12:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], 3 +// CHECK-NEXT: store <1 x i64> [[VLD4_LANE_ELT12]], ptr [[__RET_REPACK11]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef 
nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [4 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT13:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK14:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT13]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = insertvalue [4 x <1 x i64>] [[TMP4]], <1 x i64> [[DOTUNPACK_UNPACK14]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT15:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK16:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT15]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue [4 x <1 x i64>] [[TMP5]], <1 x i64> [[DOTUNPACK_UNPACK16]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT17:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK18:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT17]], align 8
+// CHECK-NEXT: [[DOTUNPACK19:%.*]] = insertvalue [4 x <1 x i64>] [[TMP6]], <1 x i64> [[DOTUNPACK_UNPACK18]], 3
+// CHECK-NEXT: [[TMP7:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] poison, [4 x <1 x i64>] [[DOTUNPACK19]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X4_T]] [[TMP7]]
+//
 poly64x1x4_t test_vld4_lane_p64(poly64_t *a, poly64x1x4_t b) { return vld4_lane_p64(a, b, 0); }
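
For context, the four vld4_lane tests above all exercise the same pattern: the four-vector argument is repacked through stack slots, reloaded element by element, and passed to @llvm.aarch64.neon.ld4lane.*, which replaces one lane of each of the four vectors from memory. A minimal usage sketch in C; the helper name and buffer are illustrative, not taken from the test file:

    #include <arm_neon.h>

    /* Illustrative only: vld4_lane_u16 loads four consecutive uint16_t
       values from src and replaces lane 3 of each of the four vectors
       in acc, leaving all other lanes unchanged. */
    uint16x4x4_t refresh_lane3(const uint16_t *src, uint16x4x4_t acc) {
      return vld4_lane_u16(src, acc, 3);
    }

-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u8(ptr noundef %a, <16 x i8> noundef %b) #0 {
-// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
-// CHECK: store i8 [[TMP0]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i64 15
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_u8(uint8_t *a, uint8x16_t b) { vst1q_lane_u8(a, b, 15); }
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u16(ptr noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK: store i16 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[B]], i64 7
+// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_u16(uint16_t *a, uint16x8_t b) { vst1q_lane_u16(a, b, 7); }
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u32(ptr noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-// CHECK: store i32 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[B]], i64 3
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
 void test_vst1q_lane_u32(uint32_t *a, uint32x4_t b) { vst1q_lane_u32(a, b, 3); }
-// CHECK-LABEL: 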
define{{.*}} void @test_vst1q_lane_u64(ptr noundef %a, <2 x i64> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -// CHECK: store i64 [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[B]], i64 1 +// CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 8 +// CHECK-NEXT: ret void +// void test_vst1q_lane_u64(uint64_t *a, uint64x2_t b) { vst1q_lane_u64(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s8(ptr noundef %a, <16 x i8> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15 -// CHECK: store i8 [[TMP0]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i64 15 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1 +// CHECK-NEXT: ret void +// void test_vst1q_lane_s8(int8_t *a, int8x16_t b) { vst1q_lane_s8(a, b, 15); } -// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s16(ptr noundef %a, <8 x i16> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 -// CHECK: store i16 [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[B]], i64 7 +// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1q_lane_s16(int16_t *a, int16x8_t b) { vst1q_lane_s16(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s32(ptr noundef %a, <4 x i32> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -// CHECK: store i32 [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[B]], i64 3 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1q_lane_s32(int32_t *a, int32x4_t b) { vst1q_lane_s32(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s64(ptr noundef %a, <2 x i64> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -// CHECK: store i64 [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[B]], i64 1 +// CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 8 +// CHECK-NEXT: ret void +// void 
test_vst1q_lane_s64(int64_t *a, int64x2_t b) { vst1q_lane_s64(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f16(ptr noundef %a, <8 x half> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK: [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7 -// CHECK: store half [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x half> [[B]], i64 7 +// CHECK-NEXT: store half [[TMP0]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1q_lane_f16(float16_t *a, float16x8_t b) { vst1q_lane_f16(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f32(ptr noundef %a, <4 x float> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 -// CHECK: store float [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[B]], i64 3 +// CHECK-NEXT: store float [[TMP0]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1q_lane_f32(float32_t *a, float32x4_t b) { vst1q_lane_f32(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f64(ptr noundef %a, <2 x double> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double> -// CHECK: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 -// CHECK: store double [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_lane_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[B]], i64 1 +// CHECK-NEXT: store double [[TMP0]], ptr [[A]], align 8 +// CHECK-NEXT: ret void +// void test_vst1q_lane_f64(float64_t *a, float64x2_t b) { vst1q_lane_f64(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p8(ptr noundef %a, <16 x i8> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15 -// CHECK: store i8 [[TMP0]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i64 15 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1 +// CHECK-NEXT: ret void +// void test_vst1q_lane_p8(poly8_t *a, poly8x16_t b) { vst1q_lane_p8(a, b, 15); } -// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p16(ptr noundef %a, <8 x i16> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 -// CHECK: store i16 [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[TMP0:%.*]] = extractelement <8 x i16> [[B]], i64 7 +// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1q_lane_p16(poly16_t *a, poly16x8_t b) { vst1q_lane_p16(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p64(ptr noundef %a, <2 x i64> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 -// CHECK: store i64 [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1q_lane_p64( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[B]], i64 1 +// CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 8 +// CHECK-NEXT: ret void +// void test_vst1q_lane_p64(poly64_t *a, poly64x2_t b) { vst1q_lane_p64(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst1_lane_u8(ptr noundef %a, <8 x i8> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7 -// CHECK: store i8 [[TMP0]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i64 7 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1 +// CHECK-NEXT: ret void +// void test_vst1_lane_u8(uint8_t *a, uint8x8_t b) { vst1_lane_u8(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst1_lane_u16(ptr noundef %a, <4 x i16> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 -// CHECK: store i16 [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[B]], i64 3 +// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1_lane_u16(uint16_t *a, uint16x4_t b) { vst1_lane_u16(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst1_lane_u32(ptr noundef %a, <2 x i32> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -// CHECK: store i32 [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[B]], i64 1 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1_lane_u32(uint32_t *a, uint32x2_t b) { vst1_lane_u32(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst1_lane_u64(ptr noundef %a, <1 x i64> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 -// CHECK: store i64 [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef 
[[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[B]], i64 0 +// CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 8 +// CHECK-NEXT: ret void +// void test_vst1_lane_u64(uint64_t *a, uint64x1_t b) { vst1_lane_u64(a, b, 0); } -// CHECK-LABEL: define{{.*}} void @test_vst1_lane_s8(ptr noundef %a, <8 x i8> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7 -// CHECK: store i8 [[TMP0]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i64 7 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1 +// CHECK-NEXT: ret void +// void test_vst1_lane_s8(int8_t *a, int8x8_t b) { vst1_lane_s8(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst1_lane_s16(ptr noundef %a, <4 x i16> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 -// CHECK: store i16 [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[B]], i64 3 +// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1_lane_s16(int16_t *a, int16x4_t b) { vst1_lane_s16(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst1_lane_s32(ptr noundef %a, <2 x i32> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -// CHECK: store i32 [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[B]], i64 1 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1_lane_s32(int32_t *a, int32x2_t b) { vst1_lane_s32(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst1_lane_s64(ptr noundef %a, <1 x i64> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 -// CHECK: store i64 [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[B]], i64 0 +// CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 8 +// CHECK-NEXT: ret void +// void test_vst1_lane_s64(int64_t *a, int64x1_t b) { vst1_lane_s64(a, b, 0); } -// CHECK-LABEL: define{{.*}} void @test_vst1_lane_f16(ptr noundef %a, <4 x half> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// CHECK: [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3 -// CHECK: store half [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_lane_f16( +// 
CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[B]], i64 3 +// CHECK-NEXT: store half [[TMP0]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1_lane_f16(float16_t *a, float16x4_t b) { vst1_lane_f16(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst1_lane_f32(ptr noundef %a, <2 x float> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -// CHECK: store float [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x float> [[B]], i64 1 +// CHECK-NEXT: store float [[TMP0]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1_lane_f32(float32_t *a, float32x2_t b) { vst1_lane_f32(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst1_lane_f64(ptr noundef %a, <1 x double> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double> -// CHECK: [[TMP3:%.*]] = extractelement <1 x double> [[TMP2]], i32 0 -// CHECK: store double [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_lane_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[B]], i64 0 +// CHECK-NEXT: store double [[TMP0]], ptr [[A]], align 8 +// CHECK-NEXT: ret void +// void test_vst1_lane_f64(float64_t *a, float64x1_t b) { vst1_lane_f64(a, b, 0); } -// CHECK-LABEL: define{{.*}} void @test_vst1_lane_p8(ptr noundef %a, <8 x i8> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7 -// CHECK: store i8 [[TMP0]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i64 7 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1 +// CHECK-NEXT: ret void +// void test_vst1_lane_p8(poly8_t *a, poly8x8_t b) { vst1_lane_p8(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst1_lane_p16(ptr noundef %a, <4 x i16> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 -// CHECK: store i16 [[TMP3]], ptr %a -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst1_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[B]], i64 3 +// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1_lane_p16(poly16_t *a, poly16x4_t b) { vst1_lane_p16(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst1_lane_p64(ptr noundef %a, <1 x i64> noundef %b) #0 { -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 -// CHECK: store 
i64 [[TMP3]], ptr %a
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[B]], i64 0
+// CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1_lane_p64(poly64_t *a, poly64x1_t b) { vst1_lane_p64(a, b, 0); }
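
The vst1/vst1q lane-store checks above reduce to a single extractelement followed by a scalar store, which matches the intrinsic's semantics: exactly one element of the vector is written to memory. A short C sketch (illustrative helper, not from the test file); the vgetq_lane_f32 form in the second statement is assumed equivalent:

    #include <arm_neon.h>

    /* Illustrative only: both statements store the same value, element 3
       of b, to *a; a lane store writes exactly one element. */
    void store_lane3(float *a, float32x4_t b) {
      vst1q_lane_f32(a, b, 3);   /* single-lane vector store        */
      *a = vgetq_lane_f32(b, 3); /* assumed-equivalent scalar store */
    }

-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_u8(uint8_t *a, uint8x16x2_t b) { vst2q_lane_u8(a, b, 15); }
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 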
false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_lane_u16(uint16_t *a, uint16x8x2_t b) { vst2q_lane_u16(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = 
bitcast <16 x i8> [[TMP4]] to <4 x i32> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> -// CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_lane_u32(uint32_t *a, uint32x4x2_t b) { vst2q_lane_u32(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> 
[[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_u64(uint64_t *a, uint64x2x2_t b) { vst2q_lane_u64(a, b, 1); }
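
The vst2q_lane tests, which continue below for the remaining element types, exercise the interleaving two-vector lane store: lane n of each vector in the pair is written to two consecutive memory elements. An illustrative C sketch (hypothetical helper, not from the test file):

    #include <arm_neon.h>

    /* Illustrative only: writes element 1 of b.val[0] to a[0] and
       element 1 of b.val[1] to a[1], i.e. the chosen lane of each
       vector is stored interleaved at consecutive addresses. */
    void store_pair_lane1(uint64_t *a, uint64x2x2_t b) {
      vst2q_lane_u64(a, b, 1);
    }

-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst2q_lane_s8(int8_t *a, int8x16x2_t b) { vst2q_lane_s8(a, b, 15); }
-// CHECK-LABEL: define{{.*}} void 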
@test_vst2q_lane_s16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_lane_s16(int16_t *a, int16x8x2_t b) { vst2q_lane_s16(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s32(ptr noundef %a, [2 x <4 x i32>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: 
[[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> -// CHECK: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_lane_s32(int32_t *a, int32x4x2_t b) { vst2q_lane_s32(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int64x2x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a) -// 
CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_lane_s64(int64_t *a, int64x2x2_t b) { vst2q_lane_s64(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f16(ptr noundef %a, [2 x <8 x half>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: call void @llvm.aarch64.neon.st2lane.v8f16.p0(<8 x half> [[TMP7]], <8 x half> [[TMP8]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x half>] [[B_COERCE]], 1 +// 
CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_lane_f16(float16_t *a, float16x8x2_t b) { vst2q_lane_f16(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f32(ptr noundef %a, [2 x <4 x float>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> -// CHECK: call void @llvm.aarch64.neon.st2lane.v4f32.p0(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x float>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void 
test_vst2q_lane_f32(float32_t *a, float32x4x2_t b) { vst2q_lane_f32(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f64(ptr noundef %a, [2 x <2 x double>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> -// CHECK: call void @llvm.aarch64.neon.st2lane.v2f64.p0(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_lane_f64(float64_t *a, float64x2x2_t b) { vst2q_lane_f64(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p8(ptr noundef %a, [2 x <16 x i8>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: 
[[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i64 15, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_lane_p8(poly8_t *a, poly8x16x2_t b) { vst2q_lane_p8(a, b, 15); } -// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p16(ptr noundef %a, [2 x <8 x i16>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void 
@test_vst2q_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_lane_p16(poly16_t *a, poly16x8x2_t b) { vst2q_lane_p16(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p64(ptr noundef %a, [2 x <2 x i64>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2q_lane_p64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2q_lane_p64(poly64_t *a, poly64x2x2_t b) { vst2q_lane_p64(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_u8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_u8(uint8_t *a, uint8x8x2_t b) { vst2_lane_u8(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_u16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x 
i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_u16(uint16_t *a, uint16x4x2_t b) { vst2_lane_u16(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_u32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr 
[[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_u32(uint32_t *a, uint32x2x2_t b) { vst2_lane_u32(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_u64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = 
extractvalue [2 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], i64 0, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_u64(uint64_t *a, uint64x1x2_t b) { vst2_lane_u64(a, b, 0); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_s8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_s8(int8_t *a, int8x8x2_t b) { vst2_lane_s8(a, b, 7); } -// CHECK-LABEL: define{{.*}} 
void @test_vst2_lane_s16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_s16(int16_t *a, int16x4x2_t b) { vst2_lane_s16(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_s32(ptr noundef %a, [2 x <2 x i32>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr 
[[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK: call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_s32(int32_t *a, int32x2x2_t b) { vst2_lane_s32(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_s64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void 
@test_vst2_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], i64 0, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_s64(int64_t *a, int64x1x2_t b) { vst2_lane_s64(a, b, 0); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_f16(ptr noundef %a, [2 x <4 x half>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: call void @llvm.aarch64.neon.st2lane.v4f16.p0(<4 x half> [[TMP7]], <4 x half> [[TMP8]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x half>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: 
call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_f16(float16_t *a, float16x4x2_t b) { vst2_lane_f16(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_f32(ptr noundef %a, [2 x <2 x float>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> -// CHECK: call void @llvm.aarch64.neon.st2lane.v2f32.p0(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x float>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_f32(float32_t *a, float32x2x2_t b) { vst2_lane_f32(a, b, 1); } -// CHECK-LABEL: define{{.*}} void 
@test_vst2_lane_f64(ptr noundef %a, [2 x <1 x double>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> -// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> -// CHECK: call void @llvm.aarch64.neon.st2lane.v1f64.p0(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i64 0, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst2_lane_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], i64 0, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst2_lane_f64(float64_t *a, float64x1x2_t b) { vst2_lane_f64(a, b, 0); } -// CHECK-LABEL: define{{.*}} void @test_vst2_lane_p8(ptr noundef %a, [2 x <8 x i8>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr 
[[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_p8(poly8_t *a, poly8x8x2_t b) {
  vst2_lane_p8(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_p16(ptr noundef %a, [2 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_p16(poly16_t *a, poly16x4x2_t b) {
  vst2_lane_p16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_p64(ptr noundef %a, [2 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[B]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_p64(poly64_t *a, poly64x1x2_t b) {
  vst2_lane_p64(a, b, 0);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_u8(uint8_t *a, uint8x16x3_t b) {
  vst3q_lane_u8(a, b, 15);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_u16(uint16_t *a, uint16x8x3_t b) {
  vst3q_lane_u16(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_u32(uint32_t *a, uint32x4x3_t b) {
  vst3q_lane_u32(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_u64(uint64_t *a, uint64x2x3_t b) {
  vst3q_lane_u64(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_s8(int8_t *a, int8x16x3_t b) {
  vst3q_lane_s8(a, b, 15);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_s16(int16_t *a, int16x8x3_t b) {
  vst3q_lane_s16(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s32(ptr noundef %a, [3 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_s32(int32_t *a, int32x4x3_t b) {
  vst3q_lane_s32(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_s64(int64_t *a, int64x2x3_t b) {
  vst3q_lane_s64(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f16(ptr noundef %a, [3 x <8 x half>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8f16.p0(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_f16(float16_t *a, float16x8x3_t b) {
  vst3q_lane_f16(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f32(ptr noundef %a, [3 x <4 x float>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4f32.p0(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_f32(float32_t *a, float32x4x3_t b) {
  vst3q_lane_f32(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f64(ptr noundef %a, [3 x <2 x double>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2f64.p0(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_f64(float64_t *a, float64x2x3_t b) {
  vst3q_lane_f64(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p8(ptr noundef %a, [3 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_p8(poly8_t *a, poly8x16x3_t b) {
  vst3q_lane_p8(a, b, 15);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p16(ptr noundef %a, [3 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_p16(poly16_t *a, poly16x8x3_t b) {
  vst3q_lane_p16(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p64(ptr noundef %a, [3 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3q_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[B]], i64 48, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3q_lane_p64(poly64_t *a, poly64x2x3_t b) {
  vst3q_lane_p64(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_u8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_u8(uint8_t *a, uint8x8x3_t b) {
  vst3_lane_u8(a, b, 7);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_u16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_u16(uint16_t *a, uint16x4x3_t b) {
  vst3_lane_u16(a, b, 3);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_u32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_u32(uint32_t *a, uint32x2x3_t b) {
  vst3_lane_u32(a, b, 1);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_u64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
void test_vst3_lane_u64(uint64_t *a, uint64x1x3_t b) {
  vst3_lane_u64(a, b, 0);
}

-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_s8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst3_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]],
align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_lane_s8(int8_t *a, int8x8x3_t b) { vst3_lane_s8(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst3_lane_s16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = 
extractvalue [3 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_lane_s16(int16_t *a, int16x4x3_t b) { vst3_lane_s16(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst3_lane_s32(ptr noundef %a, [3 x <2 x i32>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> -// CHECK: call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], 
align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i32>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_lane_s32(int32_t *a, int32x2x3_t b) { vst3_lane_s32(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst3_lane_s64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) 
[[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X3_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], i64 0, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_lane_s64(int64_t *a, int64x1x3_t b) { vst3_lane_s64(a, b, 0); } -// CHECK-LABEL: define{{.*}} void @test_vst3_lane_f16(ptr noundef %a, [3 x <4 x half>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half> -// CHECK: call void @llvm.aarch64.neon.st3lane.v4f16.p0(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i64 3, ptr %a) -// 
CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x half>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_lane_f16(float16_t *a, float16x4x3_t b) { vst3_lane_f16(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst3_lane_f32(ptr noundef %a, [3 x <2 x float>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> -// CHECK: [[TMP11:%.*]] = 
bitcast <8 x i8> [[TMP8]] to <2 x float> -// CHECK: call void @llvm.aarch64.neon.st3lane.v2f32.p0(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x float>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_lane_f32(float32_t *a, float32x2x3_t b) { vst3_lane_f32(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst3_lane_f64(ptr noundef %a, [3 x <1 x double>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = 
bitcast <1 x double> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double> -// CHECK: call void @llvm.aarch64.neon.st3lane.v1f64.p0(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i64 0, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_lane_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 0 +// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x double>] [[B_COERCE]], 2 +// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], <1 x double> [[TMP2]], i64 0, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_lane_f64(float64_t *a, float64x1x3_t b) { vst3_lane_f64(a, b, 0); } -// CHECK-LABEL: define{{.*}} void @test_vst3_lane_p8(ptr noundef %a, [3 x <8 x i8>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr 
[[ARRAYIDX4]], align 8 -// CHECK: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_lane_p8(poly8_t *a, poly8x8x3_t b) { vst3_lane_p8(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst3_lane_p16(ptr noundef %a, [3 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK: 
[[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <4 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_lane_p16(poly16_t *a, poly16x4x3_t b) { vst3_lane_p16(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst3_lane_p64(ptr noundef %a, [3 x <1 x i64>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: 
[[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_lane_p64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) [[B]], i64 24, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], i64 0, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst3_lane_p64(poly64_t *a, poly64x1x3_t b) { vst3_lane_p64(a, b, 0); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 -// 
CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16 -// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_u8(uint8_t *a, uint8x16x4_t b) { vst4q_lane_u8(a, b, 15); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> 
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_lane_u16(uint16_t *a, uint16x8x4_t b) { vst4q_lane_u16(a, b, 7); }
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_lane_u32(uint32_t *a, uint32x4x4_t b) { vst4q_lane_u32(a, b, 3); }
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_lane_u64(uint64_t *a, uint64x2x4_t b) { vst4q_lane_u64(a, b, 1); }
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4q_lane_s8(int8_t *a, int8x16x4_t b) { vst4q_lane_s8(a, b, 15); }
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr
[[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_s16(int16_t *a, int16x8x4_t b) { vst4q_lane_s16(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s32(ptr noundef %a, [4 x <4 x i32>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32> -// CHECK: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i32>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i32>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue 
[4 x <4 x i32>] [[B_COERCE]], 3 +// CHECK-NEXT: store <4 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i32.p0(<4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_s32(int32_t *a, int32x4x4_t b) { vst4q_lane_s32(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.int64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] 
alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_s64(int64_t *a, int64x2x4_t b) { vst4q_lane_s64(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f16(ptr noundef %a, [4 x <8 x half>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i64 0, i64 2 -// 
CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half> -// CHECK: call void @llvm.aarch64.neon.st4lane.v8f16.p0(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x half>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x half>] [[B_COERCE]], 3 +// CHECK-NEXT: store <8 x half> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8f16.p0(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], <8 x half> [[TMP3]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_f16(float16_t *a, float16x8x4_t b) { vst4q_lane_f16(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f32(ptr noundef %a, [4 x <4 x float>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0 
-// CHECK: store [4 x <4 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float> -// CHECK: call void @llvm.aarch64.neon.st4lane.v4f32.p0(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i64 3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x float>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x float>] [[B_COERCE]], 3 +// CHECK-NEXT: store <4 x float> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__S1]], align 16 +// CHECK-NEXT: 
[[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4f32.p0(<4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_f32(float32_t *a, float32x4x4_t b) { vst4q_lane_f32(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f64(ptr noundef %a, [4 x <2 x double>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double> -// CHECK: call void @llvm.aarch64.neon.st4lane.v2f64.p0(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_f64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x double>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 
x double>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x double>] [[B_COERCE]], 3 +// CHECK-NEXT: store <2 x double> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2f64.p0(<2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_f64(float64_t *a, float64x2x4_t b) { vst4q_lane_f64(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p8(ptr noundef %a, [4 x <16 x i8>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <16 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16 -// CHECK: call void 
@llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: store <16 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v16i8.p0(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_p8(poly8_t *a, poly8x16x4_t b) { vst4q_lane_p8(a, b, 15); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p16(ptr noundef %a, [4 x <8 x i16>] alignstack(16) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: 
[[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16> -// CHECK: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i16>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i16>] [[B_COERCE]], 3 +// CHECK-NEXT: store <8 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i16.p0(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_p16(poly16_t *a, poly16x8x4_t b) { vst4q_lane_p16(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p64(ptr noundef %a, [4 x <2 x i64>] alignstack(16) %b.coerce) #0 { -// CHECK: 
[[B:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[B]], i64 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_lane_p64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i64>] alignstack(16) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT]], ptr [[B]], align 16 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 16 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 32 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 2 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 16 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 48 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[B_COERCE]], 3 +// CHECK-NEXT: store <2 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull 
align 16 dereferenceable(64) [[B]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX2]], align 16 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX4]], align 16 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX6]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], i64 1, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4q_lane_p64(poly64_t *a, poly64x2x4_t b) { vst4q_lane_p64(a, b, 1); } -// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8 -// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: store <8 x 
i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24 +// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4_lane_u8(uint8_t *a, uint8x8x4_t b) { vst4_lane_u8(a, b, 7); } -// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> -// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 
3, ptr %a) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8 +// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8 +// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8 +// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8 +// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16 +// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8 +// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24 +// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3 +// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 +// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]]) +// CHECK-NEXT: ret void +// void test_vst4_lane_u16(uint16_t *a, uint16x4x4_t b) { vst4_lane_u16(a, b, 3); } -// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 { -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_u32(uint32_t *a, uint32x2x4_t b) {
   vst4_lane_u32(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_UINT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_u64(uint64_t *a, uint64x1x4_t b) {
   vst4_lane_u64(a, b, 0);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_s8(int8_t *a, int8x8x4_t b) {
   vst4_lane_s8(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_s16(int16_t *a, int16x4x4_t b) {
   vst4_lane_s16(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s32(ptr noundef %a, [4 x <2 x i32>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x i32>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x i32>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i32>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x i32> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2i32.p0(<2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_s32(int32_t *a, int32x2x4_t b) {
   vst4_lane_s32(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_INT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_s64(int64_t *a, int64x1x4_t b) {
   vst4_lane_s64(a, b, 0);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f16(ptr noundef %a, [4 x <4 x half>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x half>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v4f16.p0(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x half>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x half>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x half> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4f16.p0(<4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], <4 x half> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_f16(float16_t *a, float16x4x4_t b) {
   vst4_lane_f16(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f32(ptr noundef %a, [4 x <2 x float>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <2 x float>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v2f32.p0(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i64 1, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <2 x float>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 0
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 1
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 2
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x float>] [[B_COERCE]], 3
+// CHECK-NEXT: store <2 x float> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v2f32.p0(<2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]], i64 1, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_f32(float32_t *a, float32x2x4_t b) {
   vst4_lane_f32(a, b, 1);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f64(ptr noundef %a, [4 x <1 x double>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x double>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v1f64.p0(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_f64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x double>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x double>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x double> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x double>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v1f64.p0(<1 x double> [[TMP0]], <1 x double> [[TMP1]], <1 x double> [[TMP2]], <1 x double> [[TMP3]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_f64(float64_t *a, float64x1x4_t b) {
   vst4_lane_f64(a, b, 0);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p8(ptr noundef %a, [4 x <8 x i8>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <8 x i8>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3
+// CHECK-NEXT: store <8 x i8> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v8i8.p0(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_p8(poly8_t *a, poly8x8x4_t b) {
   vst4_lane_p8(a, b, 7);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p16(ptr noundef %a, [4 x <4 x i16>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <4 x i16>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <4 x i16>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 0
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 1
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 2
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <4 x i16>] [[B_COERCE]], 3
+// CHECK-NEXT: store <4 x i16> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v4i16.p0(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i64 3, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_p16(poly16_t *a, poly16x4x4_t b) {
   vst4_lane_p16(a, b, 3);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p64(ptr noundef %a, [4 x <1 x i64>] alignstack(8) %b.coerce) #0 {
-// CHECK: [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x <1 x i64>] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[B]], i64 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i64 0, i64 2
-// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i64 0, i64 3
-// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, ptr %a)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst4_lane_p64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x <1 x i64>] alignstack(8) [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT: [[B_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT]], ptr [[B]], align 8
+// CHECK-NEXT: [[B_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 8
+// CHECK-NEXT: [[B_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT2]], ptr [[B_REPACK1]], align 8
+// CHECK-NEXT: [[B_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 16
+// CHECK-NEXT: [[B_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 2
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT4]], ptr [[B_REPACK3]], align 8
+// CHECK-NEXT: [[B_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 24
+// CHECK-NEXT: [[B_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[B_COERCE]], 3
+// CHECK-NEXT: store <1 x i64> [[B_COERCE_ELT6]], ptr [[B_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[B]], i64 32, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
+// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16
+// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8
+// CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24
+// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i64 0, ptr [[A]])
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_p64(poly64_t *a, poly64x1x4_t b) {
   vst4_lane_p64(a, b, 0);
 }
diff --git a/clang/test/CodeGen/AArch64/neon-misc-constrained.c b/clang/test/CodeGen/AArch64/neon-misc-constrained.c
index e24e129d2bc7d..be144fcdc31c5 100644
--- a/clang/test/CodeGen/AArch64/neon-misc-constrained.c
+++ b/clang/test/CodeGen/AArch64/neon-misc-constrained.c
@@ -1,17 +1,11 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
 // RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=UNCONSTRAINED %s
+// RUN: | opt -S -passes=mem2reg,instcombine | FileCheck --check-prefix=UNCONSTRAINED %s
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
 // RUN: -ffp-exception-behavior=strict \
 // RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=CONSTRAINED %s
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN: -disable-O0-optnone -S -o - %s \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN: -ffp-exception-behavior=strict \
-// RUN: -disable-O0-optnone -S -o - %s \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s
+// RUN: | opt -S -passes=mem2reg,instcombine | FileCheck --check-prefix=CONSTRAINED %s
 // REQUIRES: aarch64-registered-target
@@ -19,42 +13,66 @@
 #include <arm_neon.h>
-// COMMON-LABEL: test_vrndaq_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// UNCONSTRAINED: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> %a)
-// CONSTRAINED: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double> %a, metadata !"fpexcept.strict")
-// CHECK-ASM: frinta v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-// COMMONIR: ret <2 x double> [[VRNDA1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vrndaq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <2 x double> [[VRNDA1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vrndaq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double> [[A]], metadata !"fpexcept.strict") #[[ATTR2:[0-9]+]]
+// CONSTRAINED-NEXT: ret <2 x double> [[VRNDA1_I]]
+//
 float64x2_t test_vrndaq_f64(float64x2_t a) {
   return vrndaq_f64(a);
 }
-// COMMON-LABEL: test_vrndpq_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// UNCONSTRAINED: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a)
-// CONSTRAINED: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double> %a, metadata !"fpexcept.strict")
-// CHECK-ASM: frintp v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-// COMMONIR: ret <2 x double> [[VRNDP1_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vrndpq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <2 x double> [[VRNDP1_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vrndpq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double> [[A]], metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <2 x double> [[VRNDP1_I]]
+//
 float64x2_t test_vrndpq_f64(float64x2_t a) {
   return vrndpq_f64(a);
 }
-// COMMON-LABEL: test_vsqrtq_f32
-// COMMONIR: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// UNCONSTRAINED: [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
-// CONSTRAINED: [[VSQRT_I:%.*]] = call <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fsqrt v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
-// COMMONIR: ret <4 x float> [[VSQRT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x float> @test_vsqrtq_f32(
+// UNCONSTRAINED-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[A]])
+// UNCONSTRAINED-NEXT: ret <4 x float> [[VSQRT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x float> @test_vsqrtq_f32(
+// CONSTRAINED-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x float> [[VSQRT_I]]
+//
 float32x4_t test_vsqrtq_f32(float32x4_t a) {
   return vsqrtq_f32(a);
 }
-// COMMON-LABEL: test_vsqrtq_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// UNCONSTRAINED: [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a)
-// CONSTRAINED: [[VSQRT_I:%.*]] = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fsqrt v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
-// COMMONIR: ret <2 x double> [[VSQRT_I]]
+// UNCONSTRAINED-LABEL: define dso_local <2 x double> @test_vsqrtq_f64(
+// UNCONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <2 x double> [[VSQRT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <2 x double> @test_vsqrtq_f64(
+// CONSTRAINED-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <2 x double> [[VSQRT_I]]
+//
 float64x2_t test_vsqrtq_f64(float64x2_t a) {
   return vsqrtq_f64(a);
 }
diff --git a/clang/test/CodeGen/AArch64/neon-misc.c b/clang/test/CodeGen/AArch64/neon-misc.c
index 165f33a9f399f..c840e4753ab6b 100644
--- a/clang/test/CodeGen/AArch64/neon-misc.c
+++ b/clang/test/CodeGen/AArch64/neon-misc.c
@@ -1,2718 +1,3416 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
 // RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck %s
+// RUN: | opt -S -passes=mem2reg,instcombine | FileCheck %s
 // REQUIRES: aarch64-registered-target || arm-registered-target
 #include <arm_neon.h>
-// CHECK-LABEL: @test_vceqz_s8(
-// CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vceqz_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <8 x i8> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VCEQZ_I]]
+//
 uint8x8_t test_vceqz_s8(int8x8_t a) {
   return vceqz_s8(a);
 }
-// CHECK-LABEL: @test_vceqz_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <4 x i16> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vceqz_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i16> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VCEQZ_I]]
+//
 uint16x4_t test_vceqz_s16(int16x4_t a) {
   return vceqz_s16(a);
 }
-// CHECK-LABEL: @test_vceqz_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <2 x i32> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vceqz_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <2 x i32> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCEQZ_I]]
+//
 uint32x2_t test_vceqz_s32(int32x2_t a) {
   return vceqz_s32(a);
 }
-// CHECK-LABEL: @test_vceqz_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <1 x i64> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vceqz_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <1 x i64> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VCEQZ_I]]
+//
 uint64x1_t test_vceqz_s64(int64x1_t a) {
   return vceqz_s64(a);
 }
-// CHECK-LABEL: @test_vceqz_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <1 x i64> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vceqz_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <1 x i64> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VCEQZ_I]]
+//
 uint64x1_t test_vceqz_u64(uint64x1_t a) {
   return vceqz_u64(a);
 }
-// CHECK-LABEL: @test_vceqz_p64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <1 x i64> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vceqz_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <1 x i64> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VCEQZ_I]]
+//
 uint64x1_t test_vceqz_p64(poly64x1_t a) {
   return vceqz_p64(a);
 }
-// CHECK-LABEL: @test_vceqzq_s8(
-// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vceqzq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <16 x i8> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VCEQZ_I]]
+//
 uint8x16_t test_vceqzq_s8(int8x16_t a) {
   return vceqzq_s8(a);
 }
-// CHECK-LABEL: @test_vceqzq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <8 x i16> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vceqzq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <8 x i16> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VCEQZ_I]]
+//
 uint16x8_t test_vceqzq_s16(int16x8_t a) {
   return vceqzq_s16(a);
 }
-// CHECK-LABEL: @test_vceqzq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <4 x i32> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vceqzq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i32> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCEQZ_I]]
+//
 uint32x4_t test_vceqzq_s32(int32x4_t a) {
   return vceqzq_s32(a);
 }
-// CHECK-LABEL: @test_vceqzq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <2 x i64> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vceqzq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <2 x i64> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VCEQZ_I]]
+//
 uint64x2_t test_vceqzq_s64(int64x2_t a) {
   return vceqzq_s64(a);
 }
-// CHECK-LABEL: @test_vceqz_u8(
-// CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vceqz_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <8 x i8> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VCEQZ_I]]
+//
 uint8x8_t test_vceqz_u8(uint8x8_t a) {
   return vceqz_u8(a);
 }
-// CHECK-LABEL: @test_vceqz_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <4 x i16> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vceqz_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i16> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VCEQZ_I]]
+//
 uint16x4_t test_vceqz_u16(uint16x4_t a) {
   return vceqz_u16(a);
 }
-// CHECK-LABEL: @test_vceqz_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <2 x i32> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vceqz_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <2 x i32> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VCEQZ_I]]
+//
 uint32x2_t test_vceqz_u32(uint32x2_t a) {
   return vceqz_u32(a);
 }
-// CHECK-LABEL: @test_vceqzq_u8(
-// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
-// CHECK: ret <16 x i8> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vceqzq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <16 x i8> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[VCEQZ_I]]
+//
 uint8x16_t test_vceqzq_u8(uint8x16_t a) {
   return vceqzq_u8(a);
 }
-// CHECK-LABEL: @test_vceqzq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <8 x i16> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
-// CHECK: ret <8 x i16> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vceqzq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <8 x i16> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VCEQZ_I]]
+//
 uint16x8_t test_vceqzq_u16(uint16x8_t a) {
   return vceqzq_u16(a);
 }
-// CHECK-LABEL: @test_vceqzq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <4 x i32> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vceqzq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i32> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VCEQZ_I]]
+//
 uint32x4_t test_vceqzq_u32(uint32x4_t a) {
   return vceqzq_u32(a);
 }
-// CHECK-LABEL: @test_vceqzq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = icmp eq <2 x i64> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vceqzq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <2 x i64> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VCEQZ_I]]
+//
 uint64x2_t test_vceqzq_u64(uint64x2_t a) {
   return vceqzq_u64(a);
 }
-// CHECK-LABEL: @test_vceqz_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = fcmp oeq <2 x float> %a, zeroinitializer
-// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VCEQZ_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vceqz_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <2 x float> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x
i32> [[VCEQZ_I]] +// uint32x2_t test_vceqz_f32(float32x2_t a) { return vceqz_f32(a); } -// CHECK-LABEL: @test_vceqz_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp oeq <1 x double> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vceqz_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <1 x double> [[A]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCEQZ_I]] +// uint64x1_t test_vceqz_f64(float64x1_t a) { return vceqz_f64(a); } -// CHECK-LABEL: @test_vceqzq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp oeq <4 x float> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vceqzq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <4 x float> [[A]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCEQZ_I]] +// uint32x4_t test_vceqzq_f32(float32x4_t a) { return vceqzq_f32(a); } -// CHECK-LABEL: @test_vceqz_p8( -// CHECK: [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> -// CHECK: ret <8 x i8> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vceqz_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <8 x i8> [[A]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VCEQZ_I]] +// uint8x8_t test_vceqz_p8(poly8x8_t a) { return vceqz_p8(a); } -// CHECK-LABEL: @test_vceqzq_p8( -// CHECK: [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> -// CHECK: ret <16 x i8> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vceqzq_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <16 x i8> [[A]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VCEQZ_I]] +// uint8x16_t test_vceqzq_p8(poly8x16_t a) { return vceqzq_p8(a); } -// CHECK-LABEL: @test_vceqzq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp oeq <2 x double> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vceqzq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <2 x double> [[A]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCEQZ_I]] +// uint64x2_t test_vceqzq_f64(float64x2_t a) { return vceqzq_f64(a); } -// CHECK-LABEL: @test_vceqzq_p64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp eq <2 x i64> %a, zeroinitializer -// CHECK: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 
x i64> [[VCEQZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vceqzq_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp eq <2 x i64> [[A]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCEQZ_I]] +// uint64x2_t test_vceqzq_p64(poly64x2_t a) { return vceqzq_p64(a); } -// CHECK-LABEL: @test_vcgez_s8( -// CHECK: [[TMP0:%.*]] = icmp sge <8 x i8> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> -// CHECK: ret <8 x i8> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcgez_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VCGEZ_I]] +// uint8x8_t test_vcgez_s8(int8x8_t a) { return vcgez_s8(a); } -// CHECK-LABEL: @test_vcgez_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sge <4 x i16> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcgez_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <4 x i16> [[A]], splat (i16 -1) +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCGEZ_I]] +// uint16x4_t test_vcgez_s16(int16x4_t a) { return vcgez_s16(a); } -// CHECK-LABEL: @test_vcgez_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sge <2 x i32> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcgez_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <2 x i32> [[A]], splat (i32 -1) +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCGEZ_I]] +// uint32x2_t test_vcgez_s32(int32x2_t a) { return vcgez_s32(a); } -// CHECK-LABEL: @test_vcgez_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sge <1 x i64> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcgez_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <1 x i64> [[A]], splat (i64 -1) +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCGEZ_I]] +// uint64x1_t test_vcgez_s64(int64x1_t a) { return vcgez_s64(a); } -// CHECK-LABEL: @test_vcgezq_s8( -// CHECK: [[TMP0:%.*]] = icmp sge <16 x i8> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> -// CHECK: ret <16 x i8> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcgezq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <16 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VCGEZ_I]] +// uint8x16_t 
test_vcgezq_s8(int8x16_t a) { return vcgezq_s8(a); } -// CHECK-LABEL: @test_vcgezq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sge <8 x i16> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16> -// CHECK: ret <8 x i16> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgezq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i16> [[A]], splat (i16 -1) +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCGEZ_I]] +// uint16x8_t test_vcgezq_s16(int16x8_t a) { return vcgezq_s16(a); } -// CHECK-LABEL: @test_vcgezq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sge <4 x i32> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgezq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <4 x i32> [[A]], splat (i32 -1) +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCGEZ_I]] +// uint32x4_t test_vcgezq_s32(int32x4_t a) { return vcgezq_s32(a); } -// CHECK-LABEL: @test_vcgezq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sge <2 x i64> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcgezq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <2 x i64> [[A]], splat (i64 -1) +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCGEZ_I]] +// uint64x2_t test_vcgezq_s64(int64x2_t a) { return vcgezq_s64(a); } -// CHECK-LABEL: @test_vcgez_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp oge <2 x float> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcgez_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <2 x float> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCGEZ_I]] +// uint32x2_t test_vcgez_f32(float32x2_t a) { return vcgez_f32(a); } -// CHECK-LABEL: @test_vcgez_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp oge <1 x double> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcgez_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <1 x double> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCGEZ_I]] +// uint64x1_t test_vcgez_f64(float64x1_t a) { return vcgez_f64(a); } -// CHECK-LABEL: @test_vcgezq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp oge <4 x float> %a, 
zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgezq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <4 x float> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCGEZ_I]] +// uint32x4_t test_vcgezq_f32(float32x4_t a) { return vcgezq_f32(a); } -// CHECK-LABEL: @test_vcgezq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp oge <2 x double> %a, zeroinitializer -// CHECK: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCGEZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcgezq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <2 x double> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCGEZ_I]] +// uint64x2_t test_vcgezq_f64(float64x2_t a) { return vcgezq_f64(a); } -// CHECK-LABEL: @test_vclez_s8( -// CHECK: [[TMP0:%.*]] = icmp sle <8 x i8> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> -// CHECK: ret <8 x i8> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vclez_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <8 x i8> [[A]], splat (i8 1) +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VCLEZ_I]] +// uint8x8_t test_vclez_s8(int8x8_t a) { return vclez_s8(a); } -// CHECK-LABEL: @test_vclez_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sle <4 x i16> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vclez_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i16> [[A]], splat (i16 1) +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCLEZ_I]] +// uint16x4_t test_vclez_s16(int16x4_t a) { return vclez_s16(a); } -// CHECK-LABEL: @test_vclez_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sle <2 x i32> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vclez_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <2 x i32> [[A]], splat (i32 1) +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCLEZ_I]] +// uint32x2_t test_vclez_s32(int32x2_t a) { return vclez_s32(a); } -// CHECK-LABEL: @test_vclez_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sle <1 x i64> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vclez_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] 
= icmp slt <1 x i64> [[A]], splat (i64 1) +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCLEZ_I]] +// uint64x1_t test_vclez_s64(int64x1_t a) { return vclez_s64(a); } -// CHECK-LABEL: @test_vclezq_s8( -// CHECK: [[TMP0:%.*]] = icmp sle <16 x i8> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> -// CHECK: ret <16 x i8> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vclezq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <16 x i8> [[A]], splat (i8 1) +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VCLEZ_I]] +// uint8x16_t test_vclezq_s8(int8x16_t a) { return vclezq_s8(a); } -// CHECK-LABEL: @test_vclezq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sle <8 x i16> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16> -// CHECK: ret <8 x i16> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vclezq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <8 x i16> [[A]], splat (i16 1) +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCLEZ_I]] +// uint16x8_t test_vclezq_s16(int16x8_t a) { return vclezq_s16(a); } -// CHECK-LABEL: @test_vclezq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sle <4 x i32> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vclezq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> [[A]], splat (i32 1) +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCLEZ_I]] +// uint32x4_t test_vclezq_s32(int32x4_t a) { return vclezq_s32(a); } -// CHECK-LABEL: @test_vclezq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sle <2 x i64> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vclezq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <2 x i64> [[A]], splat (i64 1) +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCLEZ_I]] +// uint64x2_t test_vclezq_s64(int64x2_t a) { return vclezq_s64(a); } -// CHECK-LABEL: @test_vclez_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp ole <2 x float> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vclez_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <2 x float> [[A]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCLEZ_I]] +// uint32x2_t test_vclez_f32(float32x2_t a) { return vclez_f32(a); } -// CHECK-LABEL: @test_vclez_f64( -// CHECK: [[TMP0:%.*]] = 
bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp ole <1 x double> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vclez_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <1 x double> [[A]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCLEZ_I]] +// uint64x1_t test_vclez_f64(float64x1_t a) { return vclez_f64(a); } -// CHECK-LABEL: @test_vclezq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp ole <4 x float> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vclezq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <4 x float> [[A]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCLEZ_I]] +// uint32x4_t test_vclezq_f32(float32x4_t a) { return vclezq_f32(a); } -// CHECK-LABEL: @test_vclezq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp ole <2 x double> %a, zeroinitializer -// CHECK: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCLEZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vclezq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <2 x double> [[A]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCLEZ_I]] +// uint64x2_t test_vclezq_f64(float64x2_t a) { return vclezq_f64(a); } -// CHECK-LABEL: @test_vcgtz_s8( -// CHECK: [[TMP0:%.*]] = icmp sgt <8 x i8> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> -// CHECK: ret <8 x i8> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcgtz_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i8> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VCGTZ_I]] +// uint8x8_t test_vcgtz_s8(int8x8_t a) { return vcgtz_s8(a); } -// CHECK-LABEL: @test_vcgtz_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sgt <4 x i16> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcgtz_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <4 x i16> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCGTZ_I]] +// uint16x4_t test_vcgtz_s16(int16x4_t a) { return vcgtz_s16(a); } -// CHECK-LABEL: @test_vcgtz_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sgt <2 x i32> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcgtz_s32( +// 
CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <2 x i32> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCGTZ_I]] +// uint32x2_t test_vcgtz_s32(int32x2_t a) { return vcgtz_s32(a); } -// CHECK-LABEL: @test_vcgtz_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp sgt <1 x i64> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcgtz_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <1 x i64> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCGTZ_I]] +// uint64x1_t test_vcgtz_s64(int64x1_t a) { return vcgtz_s64(a); } -// CHECK-LABEL: @test_vcgtzq_s8( -// CHECK: [[TMP0:%.*]] = icmp sgt <16 x i8> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> -// CHECK: ret <16 x i8> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcgtzq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <16 x i8> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VCGTZ_I]] +// uint8x16_t test_vcgtzq_s8(int8x16_t a) { return vcgtzq_s8(a); } -// CHECK-LABEL: @test_vcgtzq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sgt <8 x i16> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16> -// CHECK: ret <8 x i16> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgtzq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i16> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCGTZ_I]] +// uint16x8_t test_vcgtzq_s16(int16x8_t a) { return vcgtzq_s16(a); } -// CHECK-LABEL: @test_vcgtzq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sgt <4 x i32> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgtzq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <4 x i32> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCGTZ_I]] +// uint32x4_t test_vcgtzq_s32(int32x4_t a) { return vcgtzq_s32(a); } -// CHECK-LABEL: @test_vcgtzq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp sgt <2 x i64> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcgtzq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <2 x i64> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCGTZ_I]] +// uint64x2_t 
test_vcgtzq_s64(int64x2_t a) { return vcgtzq_s64(a); } -// CHECK-LABEL: @test_vcgtz_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp ogt <2 x float> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcgtz_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <2 x float> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCGTZ_I]] +// uint32x2_t test_vcgtz_f32(float32x2_t a) { return vcgtz_f32(a); } -// CHECK-LABEL: @test_vcgtz_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp ogt <1 x double> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcgtz_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <1 x double> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCGTZ_I]] +// uint64x1_t test_vcgtz_f64(float64x1_t a) { return vcgtz_f64(a); } -// CHECK-LABEL: @test_vcgtzq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp ogt <4 x float> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcgtzq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <4 x float> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCGTZ_I]] +// uint32x4_t test_vcgtzq_f32(float32x4_t a) { return vcgtzq_f32(a); } -// CHECK-LABEL: @test_vcgtzq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp ogt <2 x double> %a, zeroinitializer -// CHECK: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCGTZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcgtzq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <2 x double> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCGTZ_I]] +// uint64x2_t test_vcgtzq_f64(float64x2_t a) { return vcgtzq_f64(a); } -// CHECK-LABEL: @test_vcltz_s8( -// CHECK: [[TMP0:%.*]] = icmp slt <8 x i8> %a, zeroinitializer -// CHECK: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> -// CHECK: ret <8 x i8> [[VCLTZ_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcltz_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <8 x i8> [[A]], splat (i8 7) +// CHECK-NEXT: ret <8 x i8> [[A_LOBIT]] +// uint8x8_t test_vcltz_s8(int8x8_t a) { return vcltz_s8(a); } -// CHECK-LABEL: @test_vcltz_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp slt <4 x i16> %a, zeroinitializer -// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[VCLTZ_I]] +// CHECK-LABEL: 
define dso_local <4 x i16> @test_vcltz_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <4 x i16> [[A]], splat (i16 15) +// CHECK-NEXT: ret <4 x i16> [[A_LOBIT]] +// uint16x4_t test_vcltz_s16(int16x4_t a) { return vcltz_s16(a); } -// CHECK-LABEL: @test_vcltz_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp slt <2 x i32> %a, zeroinitializer -// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCLTZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcltz_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <2 x i32> [[A]], splat (i32 31) +// CHECK-NEXT: ret <2 x i32> [[A_LOBIT]] +// uint32x2_t test_vcltz_s32(int32x2_t a) { return vcltz_s32(a); } -// CHECK-LABEL: @test_vcltz_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = icmp slt <1 x i64> %a, zeroinitializer -// CHECK: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCLTZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcltz_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <1 x i64> [[A]], splat (i64 63) +// CHECK-NEXT: ret <1 x i64> [[A_LOBIT]] +// uint64x1_t test_vcltz_s64(int64x1_t a) { return vcltz_s64(a); } -// CHECK-LABEL: @test_vcltzq_s8( -// CHECK: [[TMP0:%.*]] = icmp slt <16 x i8> %a, zeroinitializer -// CHECK: [[VCLTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8> -// CHECK: ret <16 x i8> [[VCLTZ_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcltzq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <16 x i8> [[A]], splat (i8 7) +// CHECK-NEXT: ret <16 x i8> [[A_LOBIT]] +// uint8x16_t test_vcltzq_s8(int8x16_t a) { return vcltzq_s8(a); } -// CHECK-LABEL: @test_vcltzq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp slt <8 x i16> %a, zeroinitializer -// CHECK: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16> -// CHECK: ret <8 x i16> [[VCLTZ_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcltzq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <8 x i16> [[A]], splat (i16 15) +// CHECK-NEXT: ret <8 x i16> [[A_LOBIT]] +// uint16x8_t test_vcltzq_s16(int16x8_t a) { return vcltzq_s16(a); } -// CHECK-LABEL: @test_vcltzq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp slt <4 x i32> %a, zeroinitializer -// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCLTZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcltzq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <4 x i32> [[A]], splat (i32 31) +// CHECK-NEXT: ret <4 x i32> [[A_LOBIT]] +// uint32x4_t test_vcltzq_s32(int32x4_t a) { return vcltzq_s32(a); } -// CHECK-LABEL: @test_vcltzq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = icmp slt <2 x i64> %a, zeroinitializer -// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCLTZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcltzq_s64( +// CHECK-SAME: 
<2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <2 x i64> [[A]], splat (i64 63) +// CHECK-NEXT: ret <2 x i64> [[A_LOBIT]] +// uint64x2_t test_vcltzq_s64(int64x2_t a) { return vcltzq_s64(a); } -// CHECK-LABEL: @test_vcltz_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp olt <2 x float> %a, zeroinitializer -// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> -// CHECK: ret <2 x i32> [[VCLTZ_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcltz_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <2 x float> [[A]], zeroinitializer +// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCLTZ_I]] +// uint32x2_t test_vcltz_f32(float32x2_t a) { return vcltz_f32(a); } -// CHECK-LABEL: @test_vcltz_f64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = fcmp olt <1 x double> %a, zeroinitializer -// CHECK: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64> -// CHECK: ret <1 x i64> [[VCLTZ_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vcltz_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <1 x double> [[A]], zeroinitializer +// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP0]] to <1 x i64> +// CHECK-NEXT: ret <1 x i64> [[VCLTZ_I]] +// uint64x1_t test_vcltz_f64(float64x1_t a) { return vcltz_f64(a); } -// CHECK-LABEL: @test_vcltzq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp olt <4 x float> %a, zeroinitializer -// CHECK: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> -// CHECK: ret <4 x i32> [[VCLTZ_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcltzq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> [[A]], zeroinitializer +// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCLTZ_I]] +// uint32x4_t test_vcltzq_f32(float32x4_t a) { return vcltzq_f32(a); } -// CHECK-LABEL: @test_vcltzq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = fcmp olt <2 x double> %a, zeroinitializer -// CHECK: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64> -// CHECK: ret <2 x i64> [[VCLTZ_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcltzq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <2 x double> [[A]], zeroinitializer +// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP0]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VCLTZ_I]] +// uint64x2_t test_vcltzq_f64(float64x2_t a) { return vcltzq_f64(a); } -// CHECK-LABEL: @test_vrev16_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrev16_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// int8x8_t test_vrev16_s8(int8x8_t a) { return vrev16_s8(a); } -// CHECK-LABEL: @test_vrev16_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, 
<8 x i8> %a, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrev16_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// uint8x8_t test_vrev16_u8(uint8x8_t a) { return vrev16_u8(a); } -// CHECK-LABEL: @test_vrev16_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrev16_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// poly8x8_t test_vrev16_p8(poly8x8_t a) { return vrev16_p8(a); } -// CHECK-LABEL: @test_vrev16q_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrev16q_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// int8x16_t test_vrev16q_s8(int8x16_t a) { return vrev16q_s8(a); } -// CHECK-LABEL: @test_vrev16q_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrev16q_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// uint8x16_t test_vrev16q_u8(uint8x16_t a) { return vrev16q_u8(a); } -// CHECK-LABEL: @test_vrev16q_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrev16q_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// poly8x16_t test_vrev16q_p8(poly8x16_t a) { return vrev16q_p8(a); } -// CHECK-LABEL: @test_vrev32_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrev32_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// int8x8_t test_vrev32_s8(int8x8_t a) { return vrev32_s8(a); } -// CHECK-LABEL: @test_vrev32_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vrev32_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// int16x4_t test_vrev32_s16(int16x4_t a) { return vrev32_s16(a); } -// CHECK-LABEL: @test_vrev32_u8( -// CHECK: [[SHUFFLE_I:%.*]] = 
shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrev32_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// uint8x8_t test_vrev32_u8(uint8x8_t a) { return vrev32_u8(a); } -// CHECK-LABEL: @test_vrev32_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vrev32_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// uint16x4_t test_vrev32_u16(uint16x4_t a) { return vrev32_u16(a); } -// CHECK-LABEL: @test_vrev32_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrev32_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// poly8x8_t test_vrev32_p8(poly8x8_t a) { return vrev32_p8(a); } -// CHECK-LABEL: @test_vrev32_p16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vrev32_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// poly16x4_t test_vrev32_p16(poly16x4_t a) { return vrev32_p16(a); } -// CHECK-LABEL: @test_vrev32q_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrev32q_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// int8x16_t test_vrev32q_s8(int8x16_t a) { return vrev32q_s8(a); } -// CHECK-LABEL: @test_vrev32q_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vrev32q_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// int16x8_t test_vrev32q_s16(int16x8_t a) { return vrev32q_s16(a); } -// CHECK-LABEL: @test_vrev32q_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrev32q_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// uint8x16_t test_vrev32q_u8(uint8x16_t a) { return vrev32q_u8(a); } -// CHECK-LABEL: 
@test_vrev32q_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vrev32q_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// uint16x8_t test_vrev32q_u16(uint16x8_t a) { return vrev32q_u16(a); } -// CHECK-LABEL: @test_vrev32q_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrev32q_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// poly8x16_t test_vrev32q_p8(poly8x16_t a) { return vrev32q_p8(a); } -// CHECK-LABEL: @test_vrev32q_p16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vrev32q_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// poly16x8_t test_vrev32q_p16(poly16x8_t a) { return vrev32q_p16(a); } -// CHECK-LABEL: @test_vrev64_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrev64_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// int8x8_t test_vrev64_s8(int8x8_t a) { return vrev64_s8(a); } -// CHECK-LABEL: @test_vrev64_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vrev64_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// int16x4_t test_vrev64_s16(int16x4_t a) { return vrev64_s16(a); } -// CHECK-LABEL: @test_vrev64_s32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vrev64_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] +// int32x2_t test_vrev64_s32(int32x2_t a) { return vrev64_s32(a); } -// CHECK-LABEL: @test_vrev64_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrev64_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// uint8x8_t test_vrev64_u8(uint8x8_t a) { 
return vrev64_u8(a); } -// CHECK-LABEL: @test_vrev64_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vrev64_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// uint16x4_t test_vrev64_u16(uint16x4_t a) { return vrev64_u16(a); } -// CHECK-LABEL: @test_vrev64_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vrev64_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] +// uint32x2_t test_vrev64_u32(uint32x2_t a) { return vrev64_u32(a); } -// CHECK-LABEL: @test_vrev64_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vrev64_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// poly8x8_t test_vrev64_p8(poly8x8_t a) { return vrev64_p8(a); } -// CHECK-LABEL: @test_vrev64_p16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vrev64_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// poly16x4_t test_vrev64_p16(poly16x4_t a) { return vrev64_p16(a); } -// CHECK-LABEL: @test_vrev64_f32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> -// CHECK: ret <2 x float> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vrev64_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> poison, <2 x i32> +// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]] +// float32x2_t test_vrev64_f32(float32x2_t a) { return vrev64_f32(a); } -// CHECK-LABEL: @test_vrev64q_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrev64q_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// int8x16_t test_vrev64q_s8(int8x16_t a) { return vrev64q_s8(a); } -// CHECK-LABEL: @test_vrev64q_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vrev64q_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> +// CHECK-NEXT: ret 
<8 x i16> [[SHUFFLE_I]] +// int16x8_t test_vrev64q_s16(int16x8_t a) { return vrev64q_s16(a); } -// CHECK-LABEL: @test_vrev64q_s32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vrev64q_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// int32x4_t test_vrev64q_s32(int32x4_t a) { return vrev64q_s32(a); } -// CHECK-LABEL: @test_vrev64q_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrev64q_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// uint8x16_t test_vrev64q_u8(uint8x16_t a) { return vrev64q_u8(a); } -// CHECK-LABEL: @test_vrev64q_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vrev64q_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// uint16x8_t test_vrev64q_u16(uint16x8_t a) { return vrev64q_u16(a); } -// CHECK-LABEL: @test_vrev64q_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vrev64q_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// uint32x4_t test_vrev64q_u32(uint32x4_t a) { return vrev64q_u32(a); } -// CHECK-LABEL: @test_vrev64q_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vrev64q_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// poly8x16_t test_vrev64q_p8(poly8x16_t a) { return vrev64q_p8(a); } -// CHECK-LABEL: @test_vrev64q_p16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vrev64q_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// poly16x8_t test_vrev64q_p16(poly16x8_t a) { return vrev64q_p16(a); } -// CHECK-LABEL: @test_vrev64q_f32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> -// CHECK: ret <4 x float> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vrev64q_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> +// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]] +// float32x4_t test_vrev64q_f32(float32x4_t a) { return vrev64q_f32(a); } -// CHECK-LABEL: @test_vpaddl_s8( -// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %a) -// CHECK: ret <4 x i16> [[VPADDL_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vpaddl_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VPADDL_I]] +// int16x4_t test_vpaddl_s8(int8x8_t a) { return vpaddl_s8(a); } -// CHECK-LABEL: @test_vpaddl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %a) -// CHECK: ret <2 x i32> [[VPADDL1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vpaddl_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VPADDL1_I]] +// int32x2_t test_vpaddl_s16(int16x4_t a) { return vpaddl_s16(a); } -// CHECK-LABEL: @test_vpaddl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %a) -// CHECK: ret <1 x i64> [[VPADDL1_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vpaddl_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret <1 x i64> [[VPADDL1_I]] +// int64x1_t test_vpaddl_s32(int32x2_t a) { return vpaddl_s32(a); } -// CHECK-LABEL: @test_vpaddl_u8( -// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %a) -// CHECK: ret <4 x i16> [[VPADDL_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vpaddl_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VPADDL_I]] +// uint16x4_t test_vpaddl_u8(uint8x8_t a) { return vpaddl_u8(a); } -// CHECK-LABEL: @test_vpaddl_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %a) -// CHECK: ret <2 x i32> [[VPADDL1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vpaddl_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VPADDL1_I]] +// uint32x2_t test_vpaddl_u16(uint16x4_t a) { return vpaddl_u16(a); } -// CHECK-LABEL: @test_vpaddl_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %a) -// CHECK: ret <1 x i64> [[VPADDL1_I]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vpaddl_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[A]]) +// 
CHECK-NEXT: ret <1 x i64> [[VPADDL1_I]] +// uint64x1_t test_vpaddl_u32(uint32x2_t a) { return vpaddl_u32(a); } -// CHECK-LABEL: @test_vpaddlq_s8( -// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %a) -// CHECK: ret <8 x i16> [[VPADDL_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vpaddlq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VPADDL_I]] +// int16x8_t test_vpaddlq_s8(int8x16_t a) { return vpaddlq_s8(a); } -// CHECK-LABEL: @test_vpaddlq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %a) -// CHECK: ret <4 x i32> [[VPADDL1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vpaddlq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VPADDL1_I]] +// int32x4_t test_vpaddlq_s16(int16x8_t a) { return vpaddlq_s16(a); } -// CHECK-LABEL: @test_vpaddlq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %a) -// CHECK: ret <2 x i64> [[VPADDL1_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vpaddlq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: ret <2 x i64> [[VPADDL1_I]] +// int64x2_t test_vpaddlq_s32(int32x4_t a) { return vpaddlq_s32(a); } -// CHECK-LABEL: @test_vpaddlq_u8( -// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %a) -// CHECK: ret <8 x i16> [[VPADDL_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vpaddlq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VPADDL_I]] +// uint16x8_t test_vpaddlq_u8(uint8x16_t a) { return vpaddlq_u8(a); } -// CHECK-LABEL: @test_vpaddlq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %a) -// CHECK: ret <4 x i32> [[VPADDL1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vpaddlq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VPADDL1_I]] +// uint32x4_t test_vpaddlq_u16(uint16x8_t a) { return vpaddlq_u16(a); } -// CHECK-LABEL: @test_vpaddlq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %a) -// CHECK: ret <2 x i64> [[VPADDL1_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vpaddlq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: ret <2 x i64> [[VPADDL1_I]] +// uint64x2_t 
test_vpaddlq_u32(uint32x4_t a) { return vpaddlq_u32(a); } -// CHECK-LABEL: @test_vpadal_s8( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %b) -// CHECK: [[TMP1:%.*]] = add <4 x i16> [[VPADAL_I]], %a -// CHECK: ret <4 x i16> [[TMP1]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vpadal_s8( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[VPADAL_I]], [[A]] +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) { return vpadal_s8(a, b); } -// CHECK-LABEL: @test_vpadal_s16( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %b) -// CHECK: [[TMP2:%.*]] = add <2 x i32> [[VPADAL1_I]], %a -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vpadal_s16( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VPADAL1_I]], [[A]] +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) { return vpadal_s16(a, b); } -// CHECK-LABEL: @test_vpadal_s32( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %b) -// CHECK: [[TMP2:%.*]] = add <1 x i64> [[VPADAL1_I]], %a -// CHECK: ret <1 x i64> [[TMP2]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vpadal_s32( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = add <1 x i64> [[VPADAL1_I]], [[A]] +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) { return vpadal_s32(a, b); } -// CHECK-LABEL: @test_vpadal_u8( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %b) -// CHECK: [[TMP1:%.*]] = add <4 x i16> [[VPADAL_I]], %a -// CHECK: ret <4 x i16> [[TMP1]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vpadal_u8( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[VPADAL_I]], [[A]] +// CHECK-NEXT: ret <4 x i16> [[TMP0]] +// uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) { return vpadal_u8(a, b); } -// CHECK-LABEL: @test_vpadal_u16( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %b) -// CHECK: [[TMP2:%.*]] = add <2 x i32> [[VPADAL1_I]], %a -// CHECK: ret <2 x i32> [[TMP2]] +// CHECK-LABEL: define 
dso_local <2 x i32> @test_vpadal_u16( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VPADAL1_I]], [[A]] +// CHECK-NEXT: ret <2 x i32> [[TMP0]] +// uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) { return vpadal_u16(a, b); } -// CHECK-LABEL: @test_vpadal_u32( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %b) -// CHECK: [[TMP2:%.*]] = add <1 x i64> [[VPADAL1_I]], %a -// CHECK: ret <1 x i64> [[TMP2]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vpadal_u32( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = add <1 x i64> [[VPADAL1_I]], [[A]] +// CHECK-NEXT: ret <1 x i64> [[TMP0]] +// uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) { return vpadal_u32(a, b); } -// CHECK-LABEL: @test_vpadalq_s8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %b) -// CHECK: [[TMP1:%.*]] = add <8 x i16> [[VPADAL_I]], %a -// CHECK: ret <8 x i16> [[TMP1]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vpadalq_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[VPADAL_I]], [[A]] +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) { return vpadalq_s8(a, b); } -// CHECK-LABEL: @test_vpadalq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %b) -// CHECK: [[TMP2:%.*]] = add <4 x i32> [[VPADAL1_I]], %a -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vpadalq_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[VPADAL1_I]], [[A]] +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) { return vpadalq_s16(a, b); } -// CHECK-LABEL: @test_vpadalq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %b) -// CHECK: [[TMP2:%.*]] = add <2 x i64> [[VPADAL1_I]], %a -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vpadalq_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[VPADAL1_I]], [[A]] 
+// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) { return vpadalq_s32(a, b); } -// CHECK-LABEL: @test_vpadalq_u8( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %b) -// CHECK: [[TMP1:%.*]] = add <8 x i16> [[VPADAL_I]], %a -// CHECK: ret <8 x i16> [[TMP1]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vpadalq_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[VPADAL_I]], [[A]] +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) { return vpadalq_u8(a, b); } -// CHECK-LABEL: @test_vpadalq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %b) -// CHECK: [[TMP2:%.*]] = add <4 x i32> [[VPADAL1_I]], %a -// CHECK: ret <4 x i32> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vpadalq_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[VPADAL1_I]], [[A]] +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) { return vpadalq_u16(a, b); } -// CHECK-LABEL: @test_vpadalq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %b) -// CHECK: [[TMP2:%.*]] = add <2 x i64> [[VPADAL1_I]], %a -// CHECK: ret <2 x i64> [[TMP2]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vpadalq_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[B]]) +// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[VPADAL1_I]], [[A]] +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) { return vpadalq_u32(a, b); } -// CHECK-LABEL: @test_vqabs_s8( -// CHECK: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VQABS_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vqabs_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VQABS_V_I]] +// int8x8_t test_vqabs_s8(int8x8_t a) { return vqabs_s8(a); } -// CHECK-LABEL: @test_vqabsq_s8( -// CHECK: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VQABSQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vqabsq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VQABSQ_V_I]] +// int8x16_t test_vqabsq_s8(int8x16_t a) { 
return vqabsq_s8(a); } -// CHECK-LABEL: @test_vqabs_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %a) -// CHECK: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQABS_V1_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqabs_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VQABS_V1_I]] +// int16x4_t test_vqabs_s16(int16x4_t a) { return vqabs_s16(a); } -// CHECK-LABEL: @test_vqabsq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %a) -// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQABSQ_V1_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqabsq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VQABSQ_V1_I]] +// int16x8_t test_vqabsq_s16(int16x8_t a) { return vqabsq_s16(a); } -// CHECK-LABEL: @test_vqabs_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %a) -// CHECK: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQABS_V1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqabs_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VQABS_V1_I]] +// int32x2_t test_vqabs_s32(int32x2_t a) { return vqabs_s32(a); } -// CHECK-LABEL: @test_vqabsq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %a) -// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQABSQ_V1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqabsq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VQABSQ_V1_I]] +// int32x4_t test_vqabsq_s32(int32x4_t a) { return vqabsq_s32(a); } -// CHECK-LABEL: @test_vqabsq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQABSQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqabs.v2i64(<2 x i64> %a) -// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <2 x i64> [[VQABSQ_V1_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQABSQ_V1_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqabsq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqabs.v2i64(<2 x i64> [[A]]) +// CHECK-NEXT: ret <2 x i64> [[VQABSQ_V1_I]] +// int64x2_t test_vqabsq_s64(int64x2_t a) { return vqabsq_s64(a); } -// CHECK-LABEL: @test_vqneg_s8( -// CHECK: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VQNEG_V_I]] +// CHECK-LABEL: 
define dso_local <8 x i8> @test_vqneg_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VQNEG_V_I]] +// int8x8_t test_vqneg_s8(int8x8_t a) { return vqneg_s8(a); } -// CHECK-LABEL: @test_vqnegq_s8( -// CHECK: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VQNEGQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vqnegq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VQNEGQ_V_I]] +// int8x16_t test_vqnegq_s8(int8x16_t a) { return vqnegq_s8(a); } -// CHECK-LABEL: @test_vqneg_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %a) -// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQNEG_V1_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vqneg_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VQNEG_V1_I]] +// int16x4_t test_vqneg_s16(int16x4_t a) { return vqneg_s16(a); } -// CHECK-LABEL: @test_vqnegq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %a) -// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQNEGQ_V1_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vqnegq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VQNEGQ_V1_I]] +// int16x8_t test_vqnegq_s16(int16x8_t a) { return vqnegq_s16(a); } -// CHECK-LABEL: @test_vqneg_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %a) -// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQNEG_V1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vqneg_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VQNEG_V1_I]] +// int32x2_t test_vqneg_s32(int32x2_t a) { return vqneg_s32(a); } -// CHECK-LABEL: @test_vqnegq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %a) -// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQNEGQ_V1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vqnegq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VQNEGQ_V1_I]] +// int32x4_t test_vqnegq_s32(int32x4_t a) { return vqnegq_s32(a); } -// CHECK-LABEL: 
@test_vqnegq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQNEGQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqneg.v2i64(<2 x i64> %a) -// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <2 x i64> [[VQNEGQ_V1_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQNEGQ_V1_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vqnegq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqneg.v2i64(<2 x i64> [[A]]) +// CHECK-NEXT: ret <2 x i64> [[VQNEGQ_V1_I]] +// int64x2_t test_vqnegq_s64(int64x2_t a) { return vqnegq_s64(a); } -// CHECK-LABEL: @test_vneg_s8( -// CHECK: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a -// CHECK: ret <8 x i8> [[SUB_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vneg_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, [[A]] +// CHECK-NEXT: ret <8 x i8> [[SUB_I]] +// int8x8_t test_vneg_s8(int8x8_t a) { return vneg_s8(a); } -// CHECK-LABEL: @test_vnegq_s8( -// CHECK: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a -// CHECK: ret <16 x i8> [[SUB_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vnegq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, [[A]] +// CHECK-NEXT: ret <16 x i8> [[SUB_I]] +// int8x16_t test_vnegq_s8(int8x16_t a) { return vnegq_s8(a); } -// CHECK-LABEL: @test_vneg_s16( -// CHECK: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a -// CHECK: ret <4 x i16> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vneg_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, [[A]] +// CHECK-NEXT: ret <4 x i16> [[SUB_I]] +// int16x4_t test_vneg_s16(int16x4_t a) { return vneg_s16(a); } -// CHECK-LABEL: @test_vnegq_s16( -// CHECK: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a -// CHECK: ret <8 x i16> [[SUB_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vnegq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, [[A]] +// CHECK-NEXT: ret <8 x i16> [[SUB_I]] +// int16x8_t test_vnegq_s16(int16x8_t a) { return vnegq_s16(a); } -// CHECK-LABEL: @test_vneg_s32( -// CHECK: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a -// CHECK: ret <2 x i32> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vneg_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, [[A]] +// CHECK-NEXT: ret <2 x i32> [[SUB_I]] +// int32x2_t test_vneg_s32(int32x2_t a) { return vneg_s32(a); } -// CHECK-LABEL: @test_vnegq_s32( -// CHECK: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a -// CHECK: ret <4 x i32> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vnegq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, [[A]] +// CHECK-NEXT: ret <4 x i32> [[SUB_I]] +// int32x4_t test_vnegq_s32(int32x4_t a) { return vnegq_s32(a); } -// CHECK-LABEL: @test_vnegq_s64( -// CHECK: [[SUB_I:%.*]] = sub <2 x i64> zeroinitializer, %a -// CHECK: ret <2 x i64> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vnegq_s64( +// 
CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> zeroinitializer, [[A]] +// CHECK-NEXT: ret <2 x i64> [[SUB_I]] +// int64x2_t test_vnegq_s64(int64x2_t a) { return vnegq_s64(a); } -// CHECK-LABEL: @test_vneg_f32( -// CHECK: [[SUB_I:%.*]] = fneg <2 x float> %a -// CHECK: ret <2 x float> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vneg_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[A]] +// CHECK-NEXT: ret <2 x float> [[FNEG_I]] +// float32x2_t test_vneg_f32(float32x2_t a) { return vneg_f32(a); } -// CHECK-LABEL: @test_vnegq_f32( -// CHECK: [[SUB_I:%.*]] = fneg <4 x float> %a -// CHECK: ret <4 x float> [[SUB_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vnegq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[A]] +// CHECK-NEXT: ret <4 x float> [[FNEG_I]] +// float32x4_t test_vnegq_f32(float32x4_t a) { return vnegq_f32(a); } -// CHECK-LABEL: @test_vnegq_f64( -// CHECK: [[SUB_I:%.*]] = fneg <2 x double> %a -// CHECK: ret <2 x double> [[SUB_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vnegq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x double> [[A]] +// CHECK-NEXT: ret <2 x double> [[FNEG_I]] +// float64x2_t test_vnegq_f64(float64x2_t a) { return vnegq_f64(a); } -// CHECK-LABEL: @test_vabs_s8( -// CHECK: [[VABS_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VABS_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vabs_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABS_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VABS_I]] +// int8x8_t test_vabs_s8(int8x8_t a) { return vabs_s8(a); } -// CHECK-LABEL: @test_vabsq_s8( -// CHECK: [[VABS_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VABS_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vabsq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABS_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VABS_I]] +// int8x16_t test_vabsq_s8(int8x16_t a) { return vabsq_s8(a); } -// CHECK-LABEL: @test_vabs_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VABS1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %a) -// CHECK: ret <4 x i16> [[VABS1_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vabs_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VABS1_I]] +// int16x4_t test_vabs_s16(int16x4_t a) { return vabs_s16(a); } -// CHECK-LABEL: @test_vabsq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VABS1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %a) -// CHECK: ret <8 x i16> [[VABS1_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vabsq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x 
i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VABS1_I]] +// int16x8_t test_vabsq_s16(int16x8_t a) { return vabsq_s16(a); } -// CHECK-LABEL: @test_vabs_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VABS1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %a) -// CHECK: ret <2 x i32> [[VABS1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vabs_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VABS1_I]] +// int32x2_t test_vabs_s32(int32x2_t a) { return vabs_s32(a); } -// CHECK-LABEL: @test_vabsq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VABS1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %a) -// CHECK: ret <4 x i32> [[VABS1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vabsq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VABS1_I]] +// int32x4_t test_vabsq_s32(int32x4_t a) { return vabsq_s32(a); } -// CHECK-LABEL: @test_vabsq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VABS1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.abs.v2i64(<2 x i64> %a) -// CHECK: ret <2 x i64> [[VABS1_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vabsq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.abs.v2i64(<2 x i64> [[A]]) +// CHECK-NEXT: ret <2 x i64> [[VABS1_I]] +// int64x2_t test_vabsq_s64(int64x2_t a) { return vabsq_s64(a); } -// CHECK-LABEL: @test_vabs_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[VABS1_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vabs_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret <2 x float> [[VABS1_I]] +// float32x2_t test_vabs_f32(float32x2_t a) { return vabs_f32(a); } -// CHECK-LABEL: @test_vabsq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[VABS1_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vabsq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: ret <4 x float> [[VABS1_I]] +// float32x4_t test_vabsq_f32(float32x4_t a) { return vabsq_f32(a); } -// CHECK-LABEL: @test_vabsq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VABS1_I:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[VABS1_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vabsq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret <2 x double> [[VABS1_I]] +// float64x2_t test_vabsq_f64(float64x2_t a) 
{ return vabsq_f64(a); } -// CHECK-LABEL: @test_vuqadd_s8( -// CHECK: [[VUQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VUQADD_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vuqadd_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VUQADD_I]] +// int8x8_t test_vuqadd_s8(int8x8_t a, int8x8_t b) { return vuqadd_s8(a, b); } -// CHECK-LABEL: @test_vuqaddq_s8( -// CHECK: [[VUQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VUQADD_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vuqaddq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VUQADD_I]] +// int8x16_t test_vuqaddq_s8(int8x16_t a, int8x16_t b) { return vuqaddq_s8(a, b); } -// CHECK-LABEL: @test_vuqadd_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VUQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: ret <4 x i16> [[VUQADD2_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vuqadd_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VUQADD2_I]] +// int16x4_t test_vuqadd_s16(int16x4_t a, int16x4_t b) { return vuqadd_s16(a, b); } -// CHECK-LABEL: @test_vuqaddq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VUQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i16> [[VUQADD2_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vuqaddq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VUQADD2_I]] +// int16x8_t test_vuqaddq_s16(int16x8_t a, int16x8_t b) { return vuqaddq_s16(a, b); } -// CHECK-LABEL: @test_vuqadd_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VUQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: ret <2 x i32> [[VUQADD2_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vuqadd_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VUQADD2_I]] +// int32x2_t test_vuqadd_s32(int32x2_t a, int32x2_t b) { return vuqadd_s32(a, b); } -// CHECK-LABEL: @test_vuqaddq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: 
[[VUQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: ret <4 x i32> [[VUQADD2_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vuqaddq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VUQADD2_I]] +// int32x4_t test_vuqaddq_s32(int32x4_t a, int32x4_t b) { return vuqaddq_s32(a, b); } -// CHECK-LABEL: @test_vuqaddq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VUQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: ret <2 x i64> [[VUQADD2_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vuqaddq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i64> [[VUQADD2_I]] +// int64x2_t test_vuqaddq_s64(int64x2_t a, int64x2_t b) { return vuqaddq_s64(a, b); } -// CHECK-LABEL: @test_vcls_s8( -// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VCLS_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcls_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VCLS_V_I]] +// int8x8_t test_vcls_s8(int8x8_t a) { return vcls_s8(a); } -// CHECK-LABEL: @test_vcls_u8( -// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VCLS_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcls_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VCLS_V_I]] +// int8x8_t test_vcls_u8(uint8x8_t a) { return vcls_u8(a); } -// CHECK-LABEL: @test_vclsq_s8( -// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VCLSQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vclsq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VCLSQ_V_I]] +// int8x16_t test_vclsq_s8(int8x16_t a) { return vclsq_s8(a); } -// CHECK-LABEL: @test_vclsq_u8( -// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VCLSQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vclsq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VCLSQ_V_I]] +// int8x16_t test_vclsq_u8(uint8x16_t a) { return vclsq_u8(a); } -// CHECK-LABEL: @test_vcls_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> %a) -// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> 
[[VCLS_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VCLS_V1_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcls_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VCLS_V1_I]] +// int16x4_t test_vcls_s16(int16x4_t a) { return vcls_s16(a); } -// CHECK-LABEL: @test_vcls_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> %a) -// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VCLS_V1_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcls_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VCLS_V1_I]] +// int16x4_t test_vcls_u16(uint16x4_t a) { return vcls_u16(a); } -// CHECK-LABEL: @test_vclsq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> %a) -// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VCLSQ_V1_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vclsq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VCLSQ_V1_I]] +// int16x8_t test_vclsq_s16(int16x8_t a) { return vclsq_s16(a); } -// CHECK-LABEL: @test_vclsq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> %a) -// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VCLSQ_V1_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vclsq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VCLSQ_V1_I]] +// int16x8_t test_vclsq_u16(uint16x8_t a) { return vclsq_u16(a); } -// CHECK-LABEL: @test_vcls_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> %a) -// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VCLS_V1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcls_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VCLS_V1_I]] +// int32x2_t test_vcls_s32(int32x2_t a) { return vcls_s32(a); } -// CHECK-LABEL: @test_vcls_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> %a) -// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VCLS_V1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcls_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.cls.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VCLS_V1_I]] +// int32x2_t test_vcls_u32(uint32x2_t a) { return vcls_u32(a); } -// CHECK-LABEL: @test_vclsq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> %a) -// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VCLSQ_V1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vclsq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VCLSQ_V1_I]] +// int32x4_t test_vclsq_s32(int32x4_t a) { return vclsq_s32(a); } -// CHECK-LABEL: @test_vclsq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> %a) -// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VCLSQ_V1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vclsq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VCLSQ_V1_I]] +// int32x4_t test_vclsq_u32(uint32x4_t a) { return vclsq_u32(a); } -// CHECK-LABEL: @test_vclz_s8( -// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) -// CHECK: ret <8 x i8> [[VCLZ_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vclz_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[A]], i1 false) +// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]] +// int8x8_t test_vclz_s8(int8x8_t a) { return vclz_s8(a); } -// CHECK-LABEL: @test_vclzq_s8( -// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) -// CHECK: ret <16 x i8> [[VCLZQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vclzq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[A]], i1 false) +// CHECK-NEXT: ret <16 x i8> [[VCLZQ_V_I]] +// int8x16_t test_vclzq_s8(int8x16_t a) { return vclzq_s8(a); } -// CHECK-LABEL: @test_vclz_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) -// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VCLZ_V1_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vclz_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i16 0, 17) <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[A]], i1 false) +// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]] +// int16x4_t test_vclz_s16(int16x4_t a) { return vclz_s16(a); } -// CHECK-LABEL: @test_vclzq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) -// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VCLZQ_V1_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vclzq_s16( +// CHECK-SAME: <8 x 
i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call range(i16 0, 17) <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[A]], i1 false) +// CHECK-NEXT: ret <8 x i16> [[VCLZQ_V1_I]] +// int16x8_t test_vclzq_s16(int16x8_t a) { return vclzq_s16(a); } -// CHECK-LABEL: @test_vclz_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) -// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VCLZ_V1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vclz_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[A]], i1 false) +// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]] +// int32x2_t test_vclz_s32(int32x2_t a) { return vclz_s32(a); } -// CHECK-LABEL: @test_vclzq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) -// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VCLZQ_V1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vclzq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call range(i32 0, 33) <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[A]], i1 false) +// CHECK-NEXT: ret <4 x i32> [[VCLZQ_V1_I]] +// int32x4_t test_vclzq_s32(int32x4_t a) { return vclzq_s32(a); } -// CHECK-LABEL: @test_vclz_u8( -// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) -// CHECK: ret <8 x i8> [[VCLZ_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vclz_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[A]], i1 false) +// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]] +// uint8x8_t test_vclz_u8(uint8x8_t a) { return vclz_u8(a); } -// CHECK-LABEL: @test_vclzq_u8( -// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) -// CHECK: ret <16 x i8> [[VCLZQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vclzq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[A]], i1 false) +// CHECK-NEXT: ret <16 x i8> [[VCLZQ_V_I]] +// uint8x16_t test_vclzq_u8(uint8x16_t a) { return vclzq_u8(a); } -// CHECK-LABEL: @test_vclz_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false) -// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VCLZ_V1_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vclz_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i16 0, 17) <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[A]], i1 false) +// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]] +// uint16x4_t test_vclz_u16(uint16x4_t a) { return vclz_u16(a); } -// CHECK-LABEL: @test_vclzq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) -// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x 
i16> [[VCLZQ_V1_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VCLZQ_V1_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vclzq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call range(i16 0, 17) <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[A]], i1 false) +// CHECK-NEXT: ret <8 x i16> [[VCLZQ_V1_I]] +// uint16x8_t test_vclzq_u16(uint16x8_t a) { return vclzq_u16(a); } -// CHECK-LABEL: @test_vclz_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false) -// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VCLZ_V1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vclz_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[A]], i1 false) +// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]] +// uint32x2_t test_vclz_u32(uint32x2_t a) { return vclz_u32(a); } -// CHECK-LABEL: @test_vclzq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) -// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VCLZQ_V1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vclzq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call range(i32 0, 33) <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[A]], i1 false) +// CHECK-NEXT: ret <4 x i32> [[VCLZQ_V1_I]] +// uint32x4_t test_vclzq_u32(uint32x4_t a) { return vclzq_u32(a); } -// CHECK-LABEL: @test_vcnt_s8( -// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VCNT_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcnt_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNT_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]] +// int8x8_t test_vcnt_s8(int8x8_t a) { return vcnt_s8(a); } -// CHECK-LABEL: @test_vcntq_s8( -// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VCNTQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcntq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]] +// int8x16_t test_vcntq_s8(int8x16_t a) { return vcntq_s8(a); } -// CHECK-LABEL: @test_vcnt_u8( -// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VCNT_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcnt_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNT_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]] +// uint8x8_t test_vcnt_u8(uint8x8_t a) { return vcnt_u8(a); } -// CHECK-LABEL: @test_vcntq_u8( -// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VCNTQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcntq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]] +// uint8x16_t test_vcntq_u8(uint8x16_t a) { return vcntq_u8(a); } -// CHECK-LABEL: @test_vcnt_p8( -// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VCNT_V_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vcnt_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNT_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]] +// poly8x8_t test_vcnt_p8(poly8x8_t a) { return vcnt_p8(a); } -// CHECK-LABEL: @test_vcntq_p8( -// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VCNTQ_V_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vcntq_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]] +// poly8x16_t test_vcntq_p8(poly8x16_t a) { return vcntq_p8(a); } -// CHECK-LABEL: @test_vmvn_s8( -// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1) -// CHECK: ret <8 x i8> [[NEG_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmvn_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <8 x i8> [[NOT_I]] +// int8x8_t test_vmvn_s8(int8x8_t a) { return vmvn_s8(a); } -// CHECK-LABEL: @test_vmvnq_s8( -// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1) -// CHECK: ret <16 x i8> [[NEG_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vmvnq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <16 x i8> [[NOT_I]] +// int8x16_t test_vmvnq_s8(int8x16_t a) { return vmvnq_s8(a); } -// CHECK-LABEL: @test_vmvn_s16( -// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, splat (i16 -1) -// CHECK: ret <4 x i16> [[NEG_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vmvn_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[A]], splat (i16 -1) +// CHECK-NEXT: ret <4 x i16> [[NOT_I]] +// int16x4_t test_vmvn_s16(int16x4_t a) { return vmvn_s16(a); } -// CHECK-LABEL: @test_vmvnq_s16( -// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, splat (i16 -1) -// CHECK: ret <8 x i16> [[NEG_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmvnq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[A]], splat (i16 -1) +// CHECK-NEXT: ret <8 x i16> [[NOT_I]] +// int16x8_t test_vmvnq_s16(int16x8_t a) { return vmvnq_s16(a); } -// CHECK-LABEL: @test_vmvn_s32( -// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, splat (i32 -1) -// CHECK: ret <2 x i32> [[NEG_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vmvn_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[A]], splat (i32 -1) +// CHECK-NEXT: ret <2 x i32> [[NOT_I]] +// int32x2_t test_vmvn_s32(int32x2_t a) { return vmvn_s32(a); } -// CHECK-LABEL: @test_vmvnq_s32( -// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, 
splat (i32 -1) -// CHECK: ret <4 x i32> [[NEG_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmvnq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[A]], splat (i32 -1) +// CHECK-NEXT: ret <4 x i32> [[NOT_I]] +// int32x4_t test_vmvnq_s32(int32x4_t a) { return vmvnq_s32(a); } -// CHECK-LABEL: @test_vmvn_u8( -// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1) -// CHECK: ret <8 x i8> [[NEG_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmvn_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <8 x i8> [[NOT_I]] +// uint8x8_t test_vmvn_u8(uint8x8_t a) { return vmvn_u8(a); } -// CHECK-LABEL: @test_vmvnq_u8( -// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1) -// CHECK: ret <16 x i8> [[NEG_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vmvnq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <16 x i8> [[NOT_I]] +// uint8x16_t test_vmvnq_u8(uint8x16_t a) { return vmvnq_u8(a); } -// CHECK-LABEL: @test_vmvn_u16( -// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, splat (i16 -1) -// CHECK: ret <4 x i16> [[NEG_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vmvn_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[A]], splat (i16 -1) +// CHECK-NEXT: ret <4 x i16> [[NOT_I]] +// uint16x4_t test_vmvn_u16(uint16x4_t a) { return vmvn_u16(a); } -// CHECK-LABEL: @test_vmvnq_u16( -// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, splat (i16 -1) -// CHECK: ret <8 x i16> [[NEG_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vmvnq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[A]], splat (i16 -1) +// CHECK-NEXT: ret <8 x i16> [[NOT_I]] +// uint16x8_t test_vmvnq_u16(uint16x8_t a) { return vmvnq_u16(a); } -// CHECK-LABEL: @test_vmvn_u32( -// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, splat (i32 -1) -// CHECK: ret <2 x i32> [[NEG_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vmvn_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[A]], splat (i32 -1) +// CHECK-NEXT: ret <2 x i32> [[NOT_I]] +// uint32x2_t test_vmvn_u32(uint32x2_t a) { return vmvn_u32(a); } -// CHECK-LABEL: @test_vmvnq_u32( -// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, splat (i32 -1) -// CHECK: ret <4 x i32> [[NEG_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmvnq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[A]], splat (i32 -1) +// CHECK-NEXT: ret <4 x i32> [[NOT_I]] +// uint32x4_t test_vmvnq_u32(uint32x4_t a) { return vmvnq_u32(a); } -// CHECK-LABEL: @test_vmvn_p8( -// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1) -// CHECK: ret <8 x i8> [[NEG_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vmvn_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1) +// CHECK-NEXT: ret <8 x i8> [[NOT_I]] +// poly8x8_t test_vmvn_p8(poly8x8_t a) { return vmvn_p8(a); } -// CHECK-LABEL: @test_vmvnq_p8( -// CHECK: 
-// CHECK: ret <16 x i8> [[NEG_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmvnq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <16 x i8> [[NOT_I]]
+//
 poly8x16_t test_vmvnq_p8(poly8x16_t a) { return vmvnq_p8(a); }
-// CHECK-LABEL: @test_vrbit_s8(
-// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VRBIT_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrbit_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VRBIT_I]]
+//
 int8x8_t test_vrbit_s8(int8x8_t a) { return vrbit_s8(a); }
-// CHECK-LABEL: @test_vrbitq_s8(
-// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VRBIT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrbitq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VRBIT_I]]
+//
 int8x16_t test_vrbitq_s8(int8x16_t a) { return vrbitq_s8(a); }
-// CHECK-LABEL: @test_vrbit_u8(
-// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VRBIT_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrbit_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VRBIT_I]]
+//
 uint8x8_t test_vrbit_u8(uint8x8_t a) { return vrbit_u8(a); }
-// CHECK-LABEL: @test_vrbitq_u8(
-// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VRBIT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrbitq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VRBIT_I]]
+//
 uint8x16_t test_vrbitq_u8(uint8x16_t a) { return vrbitq_u8(a); }
-// CHECK-LABEL: @test_vrbit_p8(
-// CHECK: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VRBIT_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vrbit_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRBIT_I:%.*]] = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VRBIT_I]]
+//
 poly8x8_t test_vrbit_p8(poly8x8_t a) { return vrbit_p8(a); }
-// CHECK-LABEL: @test_vrbitq_p8(
-// CHECK: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VRBIT_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vrbitq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRBIT_I:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VRBIT_I]]
+//
 poly8x16_t test_vrbitq_p8(poly8x16_t a) { return vrbitq_p8(a); }
-// CHECK-LABEL: @test_vmovn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[VMOVN_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmovn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <8 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VMOVN_I]]
+//
 int8x8_t test_vmovn_s16(int16x8_t a) { return vmovn_s16(a); }
-// CHECK-LABEL: @test_vmovn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[VMOVN_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmovn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <4 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VMOVN_I]]
+//
 int16x4_t test_vmovn_s32(int32x4_t a) { return vmovn_s32(a); }
-// CHECK-LABEL: @test_vmovn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[VMOVN_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmovn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <2 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VMOVN_I]]
+//
 int32x2_t test_vmovn_s64(int64x2_t a) { return vmovn_s64(a); }
-// CHECK-LABEL: @test_vmovn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[VMOVN_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vmovn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <8 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VMOVN_I]]
+//
 uint8x8_t test_vmovn_u16(uint16x8_t a) { return vmovn_u16(a); }
-// CHECK-LABEL: @test_vmovn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[VMOVN_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vmovn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <4 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VMOVN_I]]
+//
 uint16x4_t test_vmovn_u32(uint32x4_t a) { return vmovn_u32(a); }
-// CHECK-LABEL: @test_vmovn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[VMOVN_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vmovn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <2 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VMOVN_I]]
+//
 uint32x2_t test_vmovn_u64(uint64x2_t a) { return vmovn_u64(a); }
-// CHECK-LABEL: @test_vmovn_high_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMOVN_I_I:%.*]] = trunc <8 x i16> %b to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmovn_high_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <8 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VMOVN_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vmovn_high_s16(int8x8_t a, int16x8_t b) { return vmovn_high_s16(a, b); }
-// CHECK-LABEL: @test_vmovn_high_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMOVN_I_I:%.*]] = trunc <4 x i32> %b to <4 x i16>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmovn_high_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <4 x i32> [[B]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VMOVN_I_I]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vmovn_high_s32(int16x4_t a, int32x4_t b) { return vmovn_high_s32(a, b); }
-// CHECK-LABEL: @test_vmovn_high_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VMOVN_I_I:%.*]] = trunc <2 x i64> %b to <2 x i32>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmovn_high_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <2 x i64> [[B]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VMOVN_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vmovn_high_s64(int32x2_t a, int64x2_t b) { return vmovn_high_s64(a, b); }
-// CHECK-LABEL: @test_vmovn_high_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VMOVN_I_I:%.*]] = trunc <8 x i16> %b to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vmovn_high_u16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <8 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VMOVN_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vmovn_high_u16(int8x8_t a, int16x8_t b) { return vmovn_high_u16(a, b); }
-// CHECK-LABEL: @test_vmovn_high_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VMOVN_I_I:%.*]] = trunc <4 x i32> %b to <4 x i16>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vmovn_high_u32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <4 x i32> [[B]] to <4 x i16>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VMOVN_I_I]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vmovn_high_u32(int16x4_t a, int32x4_t b) { return vmovn_high_u32(a, b); }
-// CHECK-LABEL: @test_vmovn_high_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VMOVN_I_I:%.*]] = trunc <2 x i64> %b to <2 x i32>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vmovn_high_u64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I_I:%.*]] = trunc <2 x i64> [[B]] to <2 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VMOVN_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vmovn_high_u64(int32x2_t a, int64x2_t b) { return vmovn_high_u64(a, b); }
-// CHECK-LABEL: @test_vqmovun_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %a)
-// CHECK: ret <8 x i8> [[VQMOVUN_V1_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqmovun_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQMOVUN_V1_I]]
+//
 int8x8_t test_vqmovun_s16(int16x8_t a) { return vqmovun_s16(a); }
-// CHECK-LABEL: @test_vqmovun_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %a)
-// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQMOVUN_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqmovun_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VQMOVUN_V1_I]]
+//
 int16x4_t test_vqmovun_s32(int32x4_t a) { return vqmovun_s32(a); }
-// CHECK-LABEL: @test_vqmovun_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %a)
-// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQMOVUN_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqmovun_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VQMOVUN_V1_I]]
+//
 int32x2_t test_vqmovun_s64(int64x2_t a) { return vqmovun_s64(a); }
-// CHECK-LABEL: @test_vqmovun_high_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVUN_V1_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqmovun_high_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVUN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQMOVUN_V1_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vqmovun_high_s16(uint8x8_t a, int16x8_t b) { return vqmovun_high_s16(a, b); }
-// CHECK-LABEL: @test_vqmovun_high_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %b)
-// CHECK: [[VQMOVUN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQMOVUN_V1_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqmovun_high_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVUN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQMOVUN_V1_I_I]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vqmovun_high_s32(uint16x4_t a, int32x4_t b) { return vqmovun_high_s32(a, b); }
-// CHECK-LABEL: @test_vqmovun_high_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %b)
-// CHECK: [[VQMOVUN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQMOVUN_V1_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqmovun_high_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVUN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQMOVUN_V1_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vqmovun_high_s64(uint32x2_t a, int64x2_t b) { return vqmovun_high_s64(a, b); }
-// CHECK-LABEL: @test_vqmovn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %a)
-// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqmovn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQMOVN_V1_I]]
+//
 int8x8_t test_vqmovn_s16(int16x8_t a) { return vqmovn_s16(a); }
-// CHECK-LABEL: @test_vqmovn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqmovn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VQMOVN_V1_I]]
+//
 int16x4_t test_vqmovn_s32(int32x4_t a) { return vqmovn_s32(a); }
-// CHECK-LABEL: @test_vqmovn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqmovn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VQMOVN_V1_I]]
+//
 int32x2_t test_vqmovn_s64(int64x2_t a) { return vqmovn_s64(a); }
-// CHECK-LABEL: @test_vqmovn_high_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqmovn_high_s16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vqmovn_high_s16(int8x8_t a, int16x8_t b) { return vqmovn_high_s16(a, b); }
-// CHECK-LABEL: @test_vqmovn_high_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %b)
-// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQMOVN_V1_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqmovn_high_s32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQMOVN_V1_I_I]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vqmovn_high_s32(int16x4_t a, int32x4_t b) { return vqmovn_high_s32(a, b); }
-// CHECK-LABEL: @test_vqmovn_high_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %b)
-// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQMOVN_V1_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqmovn_high_s64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQMOVN_V1_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vqmovn_high_s64(int32x2_t a, int64x2_t b) { return vqmovn_high_s64(a, b); }
-// CHECK-LABEL: @test_vqmovn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %a)
-// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vqmovn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQMOVN_V1_I]]
+//
 uint8x8_t test_vqmovn_u16(uint16x8_t a) { return vqmovn_u16(a); }
-// CHECK-LABEL: @test_vqmovn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vqmovn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VQMOVN_V1_I]]
+//
 uint16x4_t test_vqmovn_u32(uint32x4_t a) { return vqmovn_u32(a); }
-// CHECK-LABEL: @test_vqmovn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vqmovn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VQMOVN_V1_I]]
+//
 uint32x2_t test_vqmovn_u64(uint64x2_t a) { return vqmovn_u64(a); }
-// CHECK-LABEL: @test_vqmovn_high_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32>
-// CHECK: ret <16 x i8> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vqmovn_high_u16(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vqmovn_high_u16(uint8x8_t a, uint16x8_t b) { return vqmovn_high_u16(a, b); }
-// CHECK-LABEL: @test_vqmovn_high_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %b)
-// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQMOVN_V1_I_I]], <8 x i32>
-// CHECK: ret <8 x i16> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vqmovn_high_u32(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[VQMOVN_V1_I_I]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vqmovn_high_u32(uint16x4_t a, uint32x4_t b) { return vqmovn_high_u32(a, b); }
-// CHECK-LABEL: @test_vqmovn_high_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %b)
-// CHECK: [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQMOVN_V1_I_I]], <4 x i32>
-// CHECK: ret <4 x i32> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vqmovn_high_u64(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[VQMOVN_V1_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vqmovn_high_u64(uint32x2_t a, uint64x2_t b) { return vqmovn_high_u64(a, b); }
-// CHECK-LABEL: @test_vshll_n_s8(
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 8)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <8 x i16> [[TMP0]], splat (i16 8)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
 int16x8_t test_vshll_n_s8(int8x8_t a) { return vshll_n_s8(a, 8); }
-// CHECK-LABEL: @test_vshll_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
 int32x4_t test_vshll_n_s16(int16x4_t a) { return vshll_n_s16(a, 16); }
-// CHECK-LABEL: @test_vshll_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 32)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <2 x i64> [[TMP0]], splat (i64 32)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
 int64x2_t test_vshll_n_s32(int32x2_t a) { return vshll_n_s32(a, 32); }
-// CHECK-LABEL: @test_vshll_n_u8(
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 8)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <8 x i16> [[TMP0]], splat (i16 8)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
 uint16x8_t test_vshll_n_u8(uint8x8_t a) { return vshll_n_u8(a, 8); }
-// CHECK-LABEL: @test_vshll_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
 uint32x4_t test_vshll_n_u16(uint16x4_t a) { return vshll_n_u16(a, 16); }
-// CHECK-LABEL: @test_vshll_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 32)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <2 x i64> [[TMP0]], splat (i64 32)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
 uint64x2_t test_vshll_n_u32(uint32x2_t a) { return vshll_n_u32(a, 32); }
-// CHECK-LABEL: @test_vshll_high_n_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 8)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_high_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <8 x i16> [[TMP0]], splat (i16 8)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
 int16x8_t test_vshll_high_n_s8(int8x16_t a) { return vshll_high_n_s8(a, 8); }
-// CHECK-LABEL: @test_vshll_high_n_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_high_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[SHUFFLE_I]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
 int32x4_t test_vshll_high_n_s16(int16x8_t a) { return vshll_high_n_s16(a, 16); }
-// CHECK-LABEL: @test_vshll_high_n_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 32)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_high_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[SHUFFLE_I]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <2 x i64> [[TMP0]], splat (i64 32)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
 int64x2_t test_vshll_high_n_s32(int32x4_t a) { return vshll_high_n_s32(a, 32); }
-// CHECK-LABEL: @test_vshll_high_n_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 8)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vshll_high_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <8 x i16> [[TMP0]], splat (i16 8)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
 uint16x8_t test_vshll_high_n_u8(uint8x16_t a) { return vshll_high_n_u8(a, 8); }
-// CHECK-LABEL: @test_vshll_high_n_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vshll_high_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[SHUFFLE_I]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
 uint32x4_t test_vshll_high_n_u16(uint16x8_t a) { return vshll_high_n_u16(a, 16); }
-// CHECK-LABEL: @test_vshll_high_n_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 32)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vshll_high_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[SHUFFLE_I]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw <2 x i64> [[TMP0]], splat (i64 32)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
 uint64x2_t test_vshll_high_n_u32(uint32x4_t a) { return vshll_high_n_u32(a, 32); }
-// CHECK-LABEL: @test_vcvt_f16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %a)
-// CHECK: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
-// CHECK: ret <4 x half> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vcvt_f16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[A]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vcvt_f16_f32(float32x4_t a) { return vcvt_f16_f32(a); }
-// CHECK-LABEL: @test_vcvt_high_f16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VCVT_F16_F321_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %b)
-// CHECK: [[VCVT_F16_F322_I_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I_I]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I_I]] to <4 x half>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x half> %a, <4 x half> [[TMP1]], <8 x i32>
-// CHECK: ret <8 x half> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt_high_f16_f32(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_F16_F321_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I_I]] to <4 x half>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[TMP0]], <8 x i32>
+// CHECK-NEXT: ret <8 x half> [[SHUFFLE_I]]
+//
 float16x8_t test_vcvt_high_f16_f32(float16x4_t a, float32x4_t b) { return vcvt_high_f16_f32(a, b); }
-// CHECK-LABEL: @test_vcvt_f32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVT_I:%.*]] = fptrunc <2 x double> %a to <2 x float>
-// CHECK: ret <2 x float> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcvt_f32_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = fptrunc <2 x double> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[VCVT_I]]
+//
 float32x2_t test_vcvt_f32_f64(float64x2_t a) { return vcvt_f32_f64(a); }
-// CHECK-LABEL: @test_vcvt_high_f32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VCVT_I_I:%.*]] = fptrunc <2 x double> %b to <2 x float>
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVT_I_I]], <4 x i32>
-// CHECK: ret <4 x float> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcvt_high_f32_f64(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I_I:%.*]] = fptrunc <2 x double> [[B]] to <2 x float>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[VCVT_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
 float32x4_t test_vcvt_high_f32_f64(float32x2_t a, float64x2_t b) { return vcvt_high_f32_f64(a, b); }
-// CHECK-LABEL: @test_vcvtx_f32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTX_F32_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %a)
-// CHECK: ret <2 x float> [[VCVTX_F32_V1_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vcvtx_f32_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTX_F32_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VCVTX_F32_V1_I]]
+//
 float32x2_t test_vcvtx_f32_f64(float64x2_t a) { return vcvtx_f32_f64(a); }
-// CHECK-LABEL: @test_vcvtx_high_f32_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK: [[VCVTX_F32_V1_I_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %b)
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVTX_F32_V1_I_I]], <4 x i32>
-// CHECK: ret <4 x float> [[SHUFFLE_I_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcvtx_high_f32_f64(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTX_F32_V1_I_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[B]])
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[VCVTX_F32_V1_I_I]], <4 x i32>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
 float32x4_t test_vcvtx_high_f32_f64(float32x2_t a, float64x2_t b) { return vcvtx_high_f32_f64(a, b); }
-// CHECK-LABEL: @test_vcvt_f32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]])
-// CHECK: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
-// CHECK: ret <4 x float> [[VCVT_F32_F161_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcvt_f32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_F32_F16_I:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]])
+// CHECK-NEXT: ret <4 x float> [[VCVT_F32_F161_I]]
+//
 float32x4_t test_vcvt_f32_f16(float16x4_t a) { return vcvt_f32_f16(a); }
-// CHECK-LABEL: @test_vcvt_high_f32_f16(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[VCVT_F32_F16_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VCVT_F32_F161_I_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I_I]])
-// CHECK: [[VCVT_F32_F162_I_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I_I]] to <16 x i8>
-// CHECK: ret <4 x float> [[VCVT_F32_F161_I_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vcvt_high_f32_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32>
+// CHECK-NEXT: [[VCVT_F32_F16_I_I:%.*]] = bitcast <4 x half> [[SHUFFLE_I]] to <4 x i16>
+// CHECK-NEXT: [[VCVT_F32_F161_I_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I_I]])
+// CHECK-NEXT: ret <4 x float> [[VCVT_F32_F161_I_I]]
+//
 float32x4_t test_vcvt_high_f32_f16(float16x8_t a) { return vcvt_high_f32_f16(a); }
-// CHECK-LABEL: @test_vcvt_f64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVT_I:%.*]] = fpext <2 x float> %a to <2 x double>
-// CHECK: ret <2 x double> [[VCVT_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcvt_f64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVT_I:%.*]] = fpext <2 x float> [[A]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[VCVT_I]]
+//
 float64x2_t test_vcvt_f64_f32(float32x2_t a) { return vcvt_f64_f32(a); }
-// CHECK-LABEL: @test_vcvt_high_f64_f32(
-// CHECK: [[SHUFFLE_I_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32>
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[SHUFFLE_I_I]] to <8 x i8>
-// CHECK: [[VCVT_I_I:%.*]] = fpext <2 x float> [[SHUFFLE_I_I]] to <2 x double>
-// CHECK: ret <2 x double> [[VCVT_I_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vcvt_high_f64_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32>
+// CHECK-NEXT: [[VCVT_I_I:%.*]] = fpext <2 x float> [[SHUFFLE_I]] to <2 x double>
+// CHECK-NEXT: ret <2 x double> [[VCVT_I_I]]
+//
 float64x2_t test_vcvt_high_f64_f32(float32x4_t a) { return vcvt_high_f64_f32(a); }
-// CHECK-LABEL: @test_vrndnq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDN1_I:%.*]] = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRNDN1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrndnq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <2 x double> @llvm.roundeven.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRNDN1_I]]
+//
 float64x2_t test_vrndnq_f64(float64x2_t a) { return vrndnq_f64(a); }
-// CHECK-LABEL: @test_vrndaq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRNDA1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrndaq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRNDA1_I]]
+//
 float64x2_t test_vrndaq_f64(float64x2_t a) { return vrndaq_f64(a); }
-// CHECK-LABEL: @test_vrndpq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRNDP1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrndpq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRNDP1_I]]
+//
 float64x2_t test_vrndpq_f64(float64x2_t a) { return vrndpq_f64(a); }
-// CHECK-LABEL: @test_vrndmq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDM1_I:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRNDM1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrndmq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRNDM1_I]]
+//
 float64x2_t test_vrndmq_f64(float64x2_t a) { return vrndmq_f64(a); }
-// CHECK-LABEL: @test_vrndxq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDX1_I:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRNDX1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrndxq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRNDX1_I]]
+//
 float64x2_t test_vrndxq_f64(float64x2_t a) { return vrndxq_f64(a); }
-// CHECK-LABEL: @test_vrndq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRNDZ1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrndq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRNDZ1_I]]
+//
 float64x2_t test_vrndq_f64(float64x2_t a) { return vrndq_f64(a); }
-// CHECK-LABEL: @test_vrndiq_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VRNDI1_I:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a)
-// CHECK: ret <2 x double> [[VRNDI1_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vrndiq_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDIQ_V1_I:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x double> [[VRNDIQ_V1_I]]
+//
 float64x2_t test_vrndiq_f64(float64x2_t a) { return vrndiq_f64(a); }
-// CHECK-LABEL: @test_vcvt_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzs.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvt_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzs.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTZ1_I]]
+//
 int32x2_t test_vcvt_s32_f32(float32x2_t a) { return vcvt_s32_f32(a); }
-// CHECK-LABEL: @test_vcvtq_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtq_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzs.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTZ1_I]]
+//
 int32x4_t test_vcvtq_s32_f32(float32x4_t a) { return vcvtq_s32_f32(a); }
-// CHECK-LABEL: @test_vcvtq_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtq_s64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzs.v2i64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTZ1_I]]
+//
 int64x2_t test_vcvtq_s64_f64(float64x2_t a) { return vcvtq_s64_f64(a); }
-// CHECK-LABEL: @test_vcvt_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzu.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvt_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtzu.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTZ1_I]]
+//
 uint32x2_t test_vcvt_u32_f32(float32x2_t a) { return vcvt_u32_f32(a); }
-// CHECK-LABEL: @test_vcvtq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[TMP1]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtzu.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTZ1_I]]
+//
 uint32x4_t test_vcvtq_u32_f32(float32x4_t a) { return vcvtq_u32_f32(a); }
-// CHECK-LABEL: @test_vcvtq_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[TMP1]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtq_u64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtzu.v2i64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTZ1_I]]
+//
 uint64x2_t test_vcvtq_u64_f64(float64x2_t a) { return vcvtq_u64_f64(a); }
-// CHECK-LABEL: @test_vcvtn_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[VCVTN1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtn_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTN1_I]]
+//
 int32x2_t test_vcvtn_s32_f32(float32x2_t a) { return vcvtn_s32_f32(a); }
-// CHECK-LABEL: @test_vcvtnq_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[VCVTN1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtnq_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTN1_I]]
+//
 int32x4_t test_vcvtnq_s32_f32(float32x4_t a) { return vcvtnq_s32_f32(a); }
-// CHECK-LABEL: @test_vcvtnq_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[VCVTN1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtnq_s64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTN1_I]]
+//
 int64x2_t test_vcvtnq_s64_f64(float64x2_t a) { return vcvtnq_s64_f64(a); }
-// CHECK-LABEL: @test_vcvtn_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[VCVTN1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtn_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTN1_I]]
+//
 uint32x2_t test_vcvtn_u32_f32(float32x2_t a) { return vcvtn_u32_f32(a); }
-// CHECK-LABEL: @test_vcvtnq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[VCVTN1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtnq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTN1_I]]
+//
 uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) { return vcvtnq_u32_f32(a); }
-// CHECK-LABEL: @test_vcvtnq_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[VCVTN1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtnq_u64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTN1_I]]
+//
 uint64x2_t test_vcvtnq_u64_f64(float64x2_t a) { return vcvtnq_u64_f64(a); }
-// CHECK-LABEL: @test_vcvtp_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[VCVTP1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtp_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTP1_I]]
+//
 int32x2_t test_vcvtp_s32_f32(float32x2_t a) { return vcvtp_s32_f32(a); }
-// CHECK-LABEL: @test_vcvtpq_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[VCVTP1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtpq_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTP1_I]]
+//
 int32x4_t test_vcvtpq_s32_f32(float32x4_t a) { return vcvtpq_s32_f32(a); }
-// CHECK-LABEL: @test_vcvtpq_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[VCVTP1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtpq_s64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTP1_I]]
+//
 int64x2_t test_vcvtpq_s64_f64(float64x2_t a) { return vcvtpq_s64_f64(a); }
-// CHECK-LABEL: @test_vcvtp_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[VCVTP1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtp_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTP1_I]]
+//
 uint32x2_t test_vcvtp_u32_f32(float32x2_t a) { return vcvtp_u32_f32(a); }
-// CHECK-LABEL: @test_vcvtpq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[VCVTP1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtpq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTP1_I]]
+//
 uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) { return vcvtpq_u32_f32(a); }
-// CHECK-LABEL: @test_vcvtpq_u64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[VCVTP1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtpq_u64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> [[A]])
+// CHECK-NEXT: ret <2 x i64> [[VCVTP1_I]]
+//
 uint64x2_t test_vcvtpq_u64_f64(float64x2_t a) { return vcvtpq_u64_f64(a); }
-// CHECK-LABEL: @test_vcvtm_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> %a)
-// CHECK: ret <2 x i32> [[VCVTM1_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtm_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCVTM1_I]]
+//
 int32x2_t test_vcvtm_s32_f32(float32x2_t a) { return vcvtm_s32_f32(a); }
-// CHECK-LABEL: @test_vcvtmq_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> %a)
-// CHECK: ret <4 x i32> [[VCVTM1_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtmq_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCVTM1_I]]
+//
 int32x4_t test_vcvtmq_s32_f32(float32x4_t a) { return vcvtmq_s32_f32(a); }
-// CHECK-LABEL: @test_vcvtmq_s64_f64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
-// CHECK: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> %a)
-// CHECK: ret <2 x i64> [[VCVTM1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtmq_s64_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> [[A]])
@llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret <2 x i64> [[VCVTM1_I]] +// int64x2_t test_vcvtmq_s64_f64(float64x2_t a) { return vcvtmq_s64_f64(a); } -// CHECK-LABEL: @test_vcvtm_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> %a) -// CHECK: ret <2 x i32> [[VCVTM1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcvtm_u32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VCVTM1_I]] +// uint32x2_t test_vcvtm_u32_f32(float32x2_t a) { return vcvtm_u32_f32(a); } -// CHECK-LABEL: @test_vcvtmq_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> %a) -// CHECK: ret <4 x i32> [[VCVTM1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtmq_u32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VCVTM1_I]] +// uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) { return vcvtmq_u32_f32(a); } -// CHECK-LABEL: @test_vcvtmq_u64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> %a) -// CHECK: ret <2 x i64> [[VCVTM1_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtmq_u64_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret <2 x i64> [[VCVTM1_I]] +// uint64x2_t test_vcvtmq_u64_f64(float64x2_t a) { return vcvtmq_u64_f64(a); } -// CHECK-LABEL: @test_vcvta_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> %a) -// CHECK: ret <2 x i32> [[VCVTA1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcvta_s32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VCVTA1_I]] +// int32x2_t test_vcvta_s32_f32(float32x2_t a) { return vcvta_s32_f32(a); } -// CHECK-LABEL: @test_vcvtaq_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> %a) -// CHECK: ret <4 x i32> [[VCVTA1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtaq_s32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VCVTA1_I]] +// int32x4_t test_vcvtaq_s32_f32(float32x4_t a) { return vcvtaq_s32_f32(a); } -// CHECK-LABEL: @test_vcvtaq_s64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> %a) -// CHECK: ret <2 x i64> [[VCVTA1_I]] +// CHECK-LABEL: 
define dso_local <2 x i64> @test_vcvtaq_s64_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret <2 x i64> [[VCVTA1_I]] +// int64x2_t test_vcvtaq_s64_f64(float64x2_t a) { return vcvtaq_s64_f64(a); } -// CHECK-LABEL: @test_vcvta_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> %a) -// CHECK: ret <2 x i32> [[VCVTA1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vcvta_u32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VCVTA1_I]] +// uint32x2_t test_vcvta_u32_f32(float32x2_t a) { return vcvta_u32_f32(a); } -// CHECK-LABEL: @test_vcvtaq_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> %a) -// CHECK: ret <4 x i32> [[VCVTA1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vcvtaq_u32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VCVTA1_I]] +// uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) { return vcvtaq_u32_f32(a); } -// CHECK-LABEL: @test_vcvtaq_u64_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> %a) -// CHECK: ret <2 x i64> [[VCVTA1_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vcvtaq_u64_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret <2 x i64> [[VCVTA1_I]] +// uint64x2_t test_vcvtaq_u64_f64(float64x2_t a) { return vcvtaq_u64_f64(a); } -// CHECK-LABEL: @test_vrsqrte_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[VRSQRTE_V1_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vrsqrte_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret <2 x float> [[VRSQRTE_V1_I]] +// float32x2_t test_vrsqrte_f32(float32x2_t a) { return vrsqrte_f32(a); } -// CHECK-LABEL: @test_vrsqrteq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[VRSQRTEQ_V1_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vrsqrteq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: ret <4 x float> [[VRSQRTEQ_V1_I]] +// float32x4_t test_vrsqrteq_f32(float32x4_t a) { return vrsqrteq_f32(a); } -// CHECK-LABEL: @test_vrsqrteq_f64( -// CHECK: 
[[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[VRSQRTEQ_V1_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrsqrteq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret <2 x double> [[VRSQRTEQ_V1_I]] +// float64x2_t test_vrsqrteq_f64(float64x2_t a) { return vrsqrteq_f64(a); } -// CHECK-LABEL: @test_vrecpe_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[VRECPE_V1_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vrecpe_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret <2 x float> [[VRECPE_V1_I]] +// float32x2_t test_vrecpe_f32(float32x2_t a) { return vrecpe_f32(a); } -// CHECK-LABEL: @test_vrecpeq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[VRECPEQ_V1_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vrecpeq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: ret <4 x float> [[VRECPEQ_V1_I]] +// float32x4_t test_vrecpeq_f32(float32x4_t a) { return vrecpeq_f32(a); } -// CHECK-LABEL: @test_vrecpeq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VRECPEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[VRECPEQ_V1_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrecpeq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret <2 x double> [[VRECPEQ_V1_I]] +// float64x2_t test_vrecpeq_f64(float64x2_t a) { return vrecpeq_f64(a); } -// CHECK-LABEL: @test_vrecpe_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> %a) -// CHECK: ret <2 x i32> [[VRECPE_V1_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vrecpe_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VRECPE_V1_I]] +// uint32x2_t test_vrecpe_u32(uint32x2_t a) { return vrecpe_u32(a); } -// CHECK-LABEL: @test_vrecpeq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> %a) -// CHECK: ret <4 x i32> [[VRECPEQ_V1_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vrecpeq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x 
i32> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VRECPEQ_V1_I]] +// uint32x4_t test_vrecpeq_u32(uint32x4_t a) { return vrecpeq_u32(a); } -// CHECK-LABEL: @test_vsqrt_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VSQRT_I:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[VSQRT_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vsqrt_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSQRT_I:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret <2 x float> [[VSQRT_I]] +// float32x2_t test_vsqrt_f32(float32x2_t a) { return vsqrt_f32(a); } -// CHECK-LABEL: @test_vsqrtq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[VSQRT_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vsqrtq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: ret <4 x float> [[VSQRT_I]] +// float32x4_t test_vsqrtq_f32(float32x4_t a) { return vsqrtq_f32(a); } -// CHECK-LABEL: @test_vsqrtq_f64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8> -// CHECK: [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[VSQRT_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vsqrtq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret <2 x double> [[VSQRT_I]] +// float64x2_t test_vsqrtq_f64(float64x2_t a) { return vsqrtq_f64(a); } -// CHECK-LABEL: @test_vcvt_f32_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i32> %a to <2 x float> -// CHECK: ret <2 x float> [[VCVT_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vcvt_f32_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <2 x i32> [[A]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[VCVT_I]] +// float32x2_t test_vcvt_f32_s32(int32x2_t a) { return vcvt_f32_s32(a); } -// CHECK-LABEL: @test_vcvt_f32_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i32> %a to <2 x float> -// CHECK: ret <2 x float> [[VCVT_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vcvt_f32_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <2 x i32> [[A]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[VCVT_I]] +// float32x2_t test_vcvt_f32_u32(uint32x2_t a) { return vcvt_f32_u32(a); } -// CHECK-LABEL: @test_vcvtq_f32_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCVT_I:%.*]] = sitofp <4 x i32> %a to <4 x float> -// CHECK: ret <4 x float> [[VCVT_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vcvtq_f32_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[VCVT_I]] +// float32x4_t test_vcvtq_f32_s32(int32x4_t a) { return vcvtq_f32_s32(a); } -// CHECK-LABEL: @test_vcvtq_f32_u32( -// CHECK: [[TMP0:%.*]] = bitcast 
<4 x i32> %a to <16 x i8> -// CHECK: [[VCVT_I:%.*]] = uitofp <4 x i32> %a to <4 x float> -// CHECK: ret <4 x float> [[VCVT_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vcvtq_f32_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[VCVT_I]] +// float32x4_t test_vcvtq_f32_u32(uint32x4_t a) { return vcvtq_f32_u32(a); } -// CHECK-LABEL: @test_vcvtq_f64_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i64> %a to <2 x double> -// CHECK: ret <2 x double> [[VCVT_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vcvtq_f64_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <2 x i64> [[A]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[VCVT_I]] +// float64x2_t test_vcvtq_f64_s64(int64x2_t a) { return vcvtq_f64_s64(a); } -// CHECK-LABEL: @test_vcvtq_f64_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i64> %a to <2 x double> -// CHECK: ret <2 x double> [[VCVT_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vcvtq_f64_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <2 x i64> [[A]] to <2 x double> +// CHECK-NEXT: ret <2 x double> [[VCVT_I]] +// float64x2_t test_vcvtq_f64_u64(uint64x2_t a) { return vcvtq_f64_u64(a); } diff --git a/clang/test/CodeGen/AArch64/neon-perm.c b/clang/test/CodeGen/AArch64/neon-perm.c index 1ffbbd5d9bc42..7b2e36c31bfae 100644 --- a/clang/test/CodeGen/AArch64/neon-perm.c +++ b/clang/test/CodeGen/AArch64/neon-perm.c @@ -1,1932 +1,2023 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \ -// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s +// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target #include -// CHECK-LABEL: @test_vuzp1_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp1_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// int8x8_t test_vuzp1_s8(int8x8_t a, int8x8_t b) { return vuzp1_s8(a, b); } -// CHECK-LABEL: @test_vuzp1q_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp1q_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// int8x16_t test_vuzp1q_s8(int8x16_t a, int8x16_t b) { return vuzp1q_s8(a, b); } -// CHECK-LABEL: @test_vuzp1_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x 
i16> @test_vuzp1_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// int16x4_t test_vuzp1_s16(int16x4_t a, int16x4_t b) { return vuzp1_s16(a, b); } -// CHECK-LABEL: @test_vuzp1q_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp1q_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// int16x8_t test_vuzp1q_s16(int16x8_t a, int16x8_t b) { return vuzp1q_s16(a, b); } -// CHECK-LABEL: @test_vuzp1_s32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vuzp1_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] +// int32x2_t test_vuzp1_s32(int32x2_t a, int32x2_t b) { return vuzp1_s32(a, b); } -// CHECK-LABEL: @test_vuzp1q_s32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vuzp1q_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// int32x4_t test_vuzp1q_s32(int32x4_t a, int32x4_t b) { return vuzp1q_s32(a, b); } -// CHECK-LABEL: @test_vuzp1q_s64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp1q_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// int64x2_t test_vuzp1q_s64(int64x2_t a, int64x2_t b) { return vuzp1q_s64(a, b); } -// CHECK-LABEL: @test_vuzp1_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp1_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// uint8x8_t test_vuzp1_u8(uint8x8_t a, uint8x8_t b) { return vuzp1_u8(a, b); } -// CHECK-LABEL: @test_vuzp1q_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp1q_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], 
<16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// uint8x16_t test_vuzp1q_u8(uint8x16_t a, uint8x16_t b) { return vuzp1q_u8(a, b); } -// CHECK-LABEL: @test_vuzp1_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp1_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// uint16x4_t test_vuzp1_u16(uint16x4_t a, uint16x4_t b) { return vuzp1_u16(a, b); } -// CHECK-LABEL: @test_vuzp1q_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp1q_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// uint16x8_t test_vuzp1q_u16(uint16x8_t a, uint16x8_t b) { return vuzp1q_u16(a, b); } -// CHECK-LABEL: @test_vuzp1_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vuzp1_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] +// uint32x2_t test_vuzp1_u32(uint32x2_t a, uint32x2_t b) { return vuzp1_u32(a, b); } -// CHECK-LABEL: @test_vuzp1q_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vuzp1q_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// uint32x4_t test_vuzp1q_u32(uint32x4_t a, uint32x4_t b) { return vuzp1q_u32(a, b); } -// CHECK-LABEL: @test_vuzp1q_u64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp1q_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// uint64x2_t test_vuzp1q_u64(uint64x2_t a, uint64x2_t b) { return vuzp1q_u64(a, b); } -// CHECK-LABEL: @test_vuzp1_f32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: ret <2 x float> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vuzp1_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]] +// float32x2_t test_vuzp1_f32(float32x2_t a, float32x2_t b) { return vuzp1_f32(a, b); } -// CHECK-LABEL: 
@test_vuzp1q_f32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> -// CHECK: ret <4 x float> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vuzp1q_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]] +// float32x4_t test_vuzp1q_f32(float32x4_t a, float32x4_t b) { return vuzp1q_f32(a, b); } -// CHECK-LABEL: @test_vuzp1q_f64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> -// CHECK: ret <2 x double> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vuzp1q_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]] +// float64x2_t test_vuzp1q_f64(float64x2_t a, float64x2_t b) { return vuzp1q_f64(a, b); } -// CHECK-LABEL: @test_vuzp1_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp1_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// poly8x8_t test_vuzp1_p8(poly8x8_t a, poly8x8_t b) { return vuzp1_p8(a, b); } -// CHECK-LABEL: @test_vuzp1q_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp1q_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// poly8x16_t test_vuzp1q_p8(poly8x16_t a, poly8x16_t b) { return vuzp1q_p8(a, b); } -// CHECK-LABEL: @test_vuzp1_p16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp1_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// poly16x4_t test_vuzp1_p16(poly16x4_t a, poly16x4_t b) { return vuzp1_p16(a, b); } -// CHECK-LABEL: @test_vuzp1q_p16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp1q_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// poly16x8_t test_vuzp1q_p16(poly16x8_t a, poly16x8_t b) { return vuzp1q_p16(a, b); } -// CHECK-LABEL: @test_vuzp2_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: 
define dso_local <8 x i8> @test_vuzp2_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// int8x8_t test_vuzp2_s8(int8x8_t a, int8x8_t b) { return vuzp2_s8(a, b); } -// CHECK-LABEL: @test_vuzp2q_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp2q_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// int8x16_t test_vuzp2q_s8(int8x16_t a, int8x16_t b) { return vuzp2q_s8(a, b); } -// CHECK-LABEL: @test_vuzp2_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp2_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// int16x4_t test_vuzp2_s16(int16x4_t a, int16x4_t b) { return vuzp2_s16(a, b); } -// CHECK-LABEL: @test_vuzp2q_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp2q_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// int16x8_t test_vuzp2q_s16(int16x8_t a, int16x8_t b) { return vuzp2q_s16(a, b); } -// CHECK-LABEL: @test_vuzp2_s32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vuzp2_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] +// int32x2_t test_vuzp2_s32(int32x2_t a, int32x2_t b) { return vuzp2_s32(a, b); } -// CHECK-LABEL: @test_vuzp2q_s32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vuzp2q_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// int32x4_t test_vuzp2q_s32(int32x4_t a, int32x4_t b) { return vuzp2q_s32(a, b); } -// CHECK-LABEL: @test_vuzp2q_s64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp2q_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> 
[[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// int64x2_t test_vuzp2q_s64(int64x2_t a, int64x2_t b) { return vuzp2q_s64(a, b); } -// CHECK-LABEL: @test_vuzp2_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp2_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// uint8x8_t test_vuzp2_u8(uint8x8_t a, uint8x8_t b) { return vuzp2_u8(a, b); } -// CHECK-LABEL: @test_vuzp2q_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp2q_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// uint8x16_t test_vuzp2q_u8(uint8x16_t a, uint8x16_t b) { return vuzp2q_u8(a, b); } -// CHECK-LABEL: @test_vuzp2_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vuzp2_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// uint16x4_t test_vuzp2_u16(uint16x4_t a, uint16x4_t b) { return vuzp2_u16(a, b); } -// CHECK-LABEL: @test_vuzp2q_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp2q_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// uint16x8_t test_vuzp2q_u16(uint16x8_t a, uint16x8_t b) { return vuzp2q_u16(a, b); } -// CHECK-LABEL: @test_vuzp2_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vuzp2_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] +// uint32x2_t test_vuzp2_u32(uint32x2_t a, uint32x2_t b) { return vuzp2_u32(a, b); } -// CHECK-LABEL: @test_vuzp2q_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vuzp2q_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// uint32x4_t test_vuzp2q_u32(uint32x4_t a, uint32x4_t b) { return vuzp2q_u32(a, b); } -// CHECK-LABEL: @test_vuzp2q_u64( -// CHECK: 
[[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp2q_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// uint64x2_t test_vuzp2q_u64(uint64x2_t a, uint64x2_t b) { return vuzp2q_u64(a, b); } -// CHECK-LABEL: @test_vuzp2_f32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: ret <2 x float> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vuzp2_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]] +// float32x2_t test_vuzp2_f32(float32x2_t a, float32x2_t b) { return vuzp2_f32(a, b); } -// CHECK-LABEL: @test_vuzp2q_f32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> -// CHECK: ret <4 x float> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vuzp2q_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]] +// float32x4_t test_vuzp2q_f32(float32x4_t a, float32x4_t b) { return vuzp2q_f32(a, b); } -// CHECK-LABEL: @test_vuzp2q_f64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> -// CHECK: ret <2 x double> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vuzp2q_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]] +// float64x2_t test_vuzp2q_f64(float64x2_t a, float64x2_t b) { return vuzp2q_f64(a, b); } -// CHECK-LABEL: @test_vuzp2_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp2_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// poly8x8_t test_vuzp2_p8(poly8x8_t a, poly8x8_t b) { return vuzp2_p8(a, b); } -// CHECK-LABEL: @test_vuzp2q_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp2q_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// poly8x16_t test_vuzp2q_p8(poly8x16_t a, poly8x16_t b) { return vuzp2q_p8(a, b); } -// CHECK-LABEL: @test_vuzp2_p16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define 
dso_local <4 x i16> @test_vuzp2_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// poly16x4_t test_vuzp2_p16(poly16x4_t a, poly16x4_t b) { return vuzp2_p16(a, b); } -// CHECK-LABEL: @test_vuzp2q_p16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vuzp2q_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// poly16x8_t test_vuzp2q_p16(poly16x8_t a, poly16x8_t b) { return vuzp2q_p16(a, b); } -// CHECK-LABEL: @test_vzip1_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vzip1_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// int8x8_t test_vzip1_s8(int8x8_t a, int8x8_t b) { return vzip1_s8(a, b); } -// CHECK-LABEL: @test_vzip1q_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vzip1q_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// int8x16_t test_vzip1q_s8(int8x16_t a, int8x16_t b) { return vzip1q_s8(a, b); } -// CHECK-LABEL: @test_vzip1_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vzip1_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// int16x4_t test_vzip1_s16(int16x4_t a, int16x4_t b) { return vzip1_s16(a, b); } -// CHECK-LABEL: @test_vzip1q_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vzip1q_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// int16x8_t test_vzip1q_s16(int16x8_t a, int16x8_t b) { return vzip1q_s16(a, b); } -// CHECK-LABEL: @test_vzip1_s32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vzip1_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 
x i32> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] +// int32x2_t test_vzip1_s32(int32x2_t a, int32x2_t b) { return vzip1_s32(a, b); } -// CHECK-LABEL: @test_vzip1q_s32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vzip1q_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// int32x4_t test_vzip1q_s32(int32x4_t a, int32x4_t b) { return vzip1q_s32(a, b); } -// CHECK-LABEL: @test_vzip1q_s64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vzip1q_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// int64x2_t test_vzip1q_s64(int64x2_t a, int64x2_t b) { return vzip1q_s64(a, b); } -// CHECK-LABEL: @test_vzip1_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vzip1_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// uint8x8_t test_vzip1_u8(uint8x8_t a, uint8x8_t b) { return vzip1_u8(a, b); } -// CHECK-LABEL: @test_vzip1q_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vzip1q_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// uint8x16_t test_vzip1q_u8(uint8x16_t a, uint8x16_t b) { return vzip1q_u8(a, b); } -// CHECK-LABEL: @test_vzip1_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vzip1_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// uint16x4_t test_vzip1_u16(uint16x4_t a, uint16x4_t b) { return vzip1_u16(a, b); } -// CHECK-LABEL: @test_vzip1q_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vzip1q_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// uint16x8_t test_vzip1q_u16(uint16x8_t a, uint16x8_t b) { return vzip1q_u16(a, b); } -// CHECK-LABEL: @test_vzip1_u32( -// CHECK: [[SHUFFLE_I:%.*]] 
= shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vzip1_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] +// uint32x2_t test_vzip1_u32(uint32x2_t a, uint32x2_t b) { return vzip1_u32(a, b); } -// CHECK-LABEL: @test_vzip1q_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vzip1q_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// uint32x4_t test_vzip1q_u32(uint32x4_t a, uint32x4_t b) { return vzip1q_u32(a, b); } -// CHECK-LABEL: @test_vzip1q_u64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vzip1q_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// uint64x2_t test_vzip1q_u64(uint64x2_t a, uint64x2_t b) { return vzip1q_u64(a, b); } -// CHECK-LABEL: @test_vzip1_f32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: ret <2 x float> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vzip1_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]] +// float32x2_t test_vzip1_f32(float32x2_t a, float32x2_t b) { return vzip1_f32(a, b); } -// CHECK-LABEL: @test_vzip1q_f32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> -// CHECK: ret <4 x float> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vzip1q_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]] +// float32x4_t test_vzip1q_f32(float32x4_t a, float32x4_t b) { return vzip1q_f32(a, b); } -// CHECK-LABEL: @test_vzip1q_f64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> -// CHECK: ret <2 x double> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x double> @test_vzip1q_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]] +// float64x2_t test_vzip1q_f64(float64x2_t a, float64x2_t b) { return vzip1q_f64(a, b); } -// CHECK-LABEL: @test_vzip1_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define 
dso_local <8 x i8> @test_vzip1_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// poly8x8_t test_vzip1_p8(poly8x8_t a, poly8x8_t b) { return vzip1_p8(a, b); } -// CHECK-LABEL: @test_vzip1q_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vzip1q_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// poly8x16_t test_vzip1q_p8(poly8x16_t a, poly8x16_t b) { return vzip1q_p8(a, b); } -// CHECK-LABEL: @test_vzip1_p16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vzip1_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// poly16x4_t test_vzip1_p16(poly16x4_t a, poly16x4_t b) { return vzip1_p16(a, b); } -// CHECK-LABEL: @test_vzip1q_p16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vzip1q_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// poly16x8_t test_vzip1q_p16(poly16x8_t a, poly16x8_t b) { return vzip1q_p16(a, b); } -// CHECK-LABEL: @test_vzip2_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vzip2_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// int8x8_t test_vzip2_s8(int8x8_t a, int8x8_t b) { return vzip2_s8(a, b); } -// CHECK-LABEL: @test_vzip2q_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vzip2q_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// int8x16_t test_vzip2q_s8(int8x16_t a, int8x16_t b) { return vzip2q_s8(a, b); } -// CHECK-LABEL: @test_vzip2_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vzip2_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> 
[[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// int16x4_t test_vzip2_s16(int16x4_t a, int16x4_t b) { return vzip2_s16(a, b); } -// CHECK-LABEL: @test_vzip2q_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vzip2q_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// int16x8_t test_vzip2q_s16(int16x8_t a, int16x8_t b) { return vzip2q_s16(a, b); } -// CHECK-LABEL: @test_vzip2_s32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vzip2_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] +// int32x2_t test_vzip2_s32(int32x2_t a, int32x2_t b) { return vzip2_s32(a, b); } -// CHECK-LABEL: @test_vzip2q_s32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vzip2q_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// int32x4_t test_vzip2q_s32(int32x4_t a, int32x4_t b) { return vzip2q_s32(a, b); } -// CHECK-LABEL: @test_vzip2q_s64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vzip2q_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// int64x2_t test_vzip2q_s64(int64x2_t a, int64x2_t b) { return vzip2q_s64(a, b); } -// CHECK-LABEL: @test_vzip2_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i8> @test_vzip2_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// uint8x8_t test_vzip2_u8(uint8x8_t a, uint8x8_t b) { return vzip2_u8(a, b); } -// CHECK-LABEL: @test_vzip2q_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <16 x i8> @test_vzip2q_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// uint8x16_t test_vzip2q_u8(uint8x16_t a, uint8x16_t b) { return vzip2q_u8(a, b); } -// CHECK-LABEL: @test_vzip2_u16( -// CHECK: [[SHUFFLE_I:%.*]] = 
shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: ret <4 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vzip2_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] +// uint16x4_t test_vzip2_u16(uint16x4_t a, uint16x4_t b) { return vzip2_u16(a, b); } -// CHECK-LABEL: @test_vzip2q_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vzip2q_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// uint16x8_t test_vzip2q_u16(uint16x8_t a, uint16x8_t b) { return vzip2q_u16(a, b); } -// CHECK-LABEL: @test_vzip2_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: ret <2 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vzip2_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] +// uint32x2_t test_vzip2_u32(uint32x2_t a, uint32x2_t b) { return vzip2_u32(a, b); } -// CHECK-LABEL: @test_vzip2q_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vzip2q_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// uint32x4_t test_vzip2q_u32(uint32x4_t a, uint32x4_t b) { return vzip2q_u32(a, b); } -// CHECK-LABEL: @test_vzip2q_u64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vzip2q_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// uint64x2_t test_vzip2q_u64(uint64x2_t a, uint64x2_t b) { return vzip2q_u64(a, b); } -// CHECK-LABEL: @test_vzip2_f32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: ret <2 x float> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x float> @test_vzip2_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]] +// float32x2_t test_vzip2_f32(float32x2_t a, float32x2_t b) { return vzip2_f32(a, b); } -// CHECK-LABEL: @test_vzip2q_f32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> -// CHECK: ret <4 x float> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <4 x float> @test_vzip2q_f32( +// 
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
 float32x4_t test_vzip2q_f32(float32x4_t a, float32x4_t b) { return vzip2q_f32(a, b); }
-// CHECK-LABEL: @test_vzip2q_f64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x double> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vzip2q_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]]
+//
 float64x2_t test_vzip2q_f64(float64x2_t a, float64x2_t b) { return vzip2q_f64(a, b); }
-// CHECK-LABEL: @test_vzip2_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vzip2_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vzip2_p8(poly8x8_t a, poly8x8_t b) { return vzip2_p8(a, b); }
-// CHECK-LABEL: @test_vzip2q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vzip2q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 poly8x16_t test_vzip2q_p8(poly8x16_t a, poly8x16_t b) { return vzip2q_p8(a, b); }
-// CHECK-LABEL: @test_vzip2_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vzip2_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vzip2_p16(poly16x4_t a, poly16x4_t b) { return vzip2_p16(a, b); }
-// CHECK-LABEL: @test_vzip2q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vzip2q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 poly16x8_t test_vzip2q_p16(poly16x8_t a, poly16x8_t b) { return vzip2q_p16(a, b); }
-// CHECK-LABEL: @test_vtrn1_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vtrn1_s8(int8x8_t a, int8x8_t b) { return vtrn1_s8(a, b); }
-// CHECK-LABEL: @test_vtrn1q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vtrn1q_s8(int8x16_t a, int8x16_t b) { return vtrn1q_s8(a, b); }
-// CHECK-LABEL: @test_vtrn1_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn1_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vtrn1_s16(int16x4_t a, int16x4_t b) { return vtrn1_s16(a, b); }
-// CHECK-LABEL: @test_vtrn1q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn1q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vtrn1q_s16(int16x8_t a, int16x8_t b) { return vtrn1q_s16(a, b); }
-// CHECK-LABEL: @test_vtrn1_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn1_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 int32x2_t test_vtrn1_s32(int32x2_t a, int32x2_t b) { return vtrn1_s32(a, b); }
-// CHECK-LABEL: @test_vtrn1q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn1q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vtrn1q_s32(int32x4_t a, int32x4_t b) { return vtrn1q_s32(a, b); }
-// CHECK-LABEL: @test_vtrn1q_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn1q_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 int64x2_t test_vtrn1q_s64(int64x2_t a, int64x2_t b) { return vtrn1q_s64(a, b); }
-// CHECK-LABEL: @test_vtrn1_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vtrn1_u8(uint8x8_t a, uint8x8_t b) { return vtrn1_u8(a, b); }
-// CHECK-LABEL: @test_vtrn1q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vtrn1q_u8(uint8x16_t a, uint8x16_t b) { return vtrn1q_u8(a, b); }
-// CHECK-LABEL: @test_vtrn1_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn1_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 uint16x4_t test_vtrn1_u16(uint16x4_t a, uint16x4_t b) { return vtrn1_u16(a, b); }
-// CHECK-LABEL: @test_vtrn1q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn1q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vtrn1q_u16(uint16x8_t a, uint16x8_t b) { return vtrn1q_u16(a, b); }
-// CHECK-LABEL: @test_vtrn1_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn1_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 uint32x2_t test_vtrn1_u32(uint32x2_t a, uint32x2_t b) { return vtrn1_u32(a, b); }
-// CHECK-LABEL: @test_vtrn1q_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn1q_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vtrn1q_u32(uint32x4_t a, uint32x4_t b) { return vtrn1q_u32(a, b); }
-// CHECK-LABEL: @test_vtrn1q_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn1q_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 uint64x2_t test_vtrn1q_u64(uint64x2_t a, uint64x2_t b) { return vtrn1q_u64(a, b); }
-// CHECK-LABEL: @test_vtrn1_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vtrn1_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
 float32x2_t test_vtrn1_f32(float32x2_t a, float32x2_t b) { return vtrn1_f32(a, b); }
-// CHECK-LABEL: @test_vtrn1q_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vtrn1q_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
 float32x4_t test_vtrn1q_f32(float32x4_t a, float32x4_t b) { return vtrn1q_f32(a, b); }
-// CHECK-LABEL: @test_vtrn1q_f64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: ret <2 x double> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vtrn1q_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]]
+//
 float64x2_t test_vtrn1q_f64(float64x2_t a, float64x2_t b) { return vtrn1q_f64(a, b); }
-// CHECK-LABEL: @test_vtrn1_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vtrn1_p8(poly8x8_t a, poly8x8_t b) { return vtrn1_p8(a, b); }
-// CHECK-LABEL: @test_vtrn1q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 poly8x16_t test_vtrn1q_p8(poly8x16_t a, poly8x16_t b) { return vtrn1q_p8(a, b); }
-// CHECK-LABEL: @test_vtrn1_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn1_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vtrn1_p16(poly16x4_t a, poly16x4_t b) { return vtrn1_p16(a, b); }
-// CHECK-LABEL: @test_vtrn1q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn1q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 poly16x8_t test_vtrn1q_p16(poly16x8_t a, poly16x8_t b) { return vtrn1q_p16(a, b); }
-// CHECK-LABEL: @test_vtrn2_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vtrn2_s8(int8x8_t a, int8x8_t b) { return vtrn2_s8(a, b); }
-// CHECK-LABEL: @test_vtrn2q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vtrn2q_s8(int8x16_t a, int8x16_t b) { return vtrn2q_s8(a, b); }
-// CHECK-LABEL: @test_vtrn2_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn2_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vtrn2_s16(int16x4_t a, int16x4_t b) { return vtrn2_s16(a, b); }
-// CHECK-LABEL: @test_vtrn2q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn2q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vtrn2q_s16(int16x8_t a, int16x8_t b) { return vtrn2q_s16(a, b); }
-// CHECK-LABEL: @test_vtrn2_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn2_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 int32x2_t test_vtrn2_s32(int32x2_t a, int32x2_t b) { return vtrn2_s32(a, b); }
-// CHECK-LABEL: @test_vtrn2q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn2q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vtrn2q_s32(int32x4_t a, int32x4_t b) { return vtrn2q_s32(a, b); }
-// CHECK-LABEL: @test_vtrn2q_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn2q_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 int64x2_t test_vtrn2q_s64(int64x2_t a, int64x2_t b) { return vtrn2q_s64(a, b); }
-// CHECK-LABEL: @test_vtrn2_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vtrn2_u8(uint8x8_t a, uint8x8_t b) { return vtrn2_u8(a, b); }
-// CHECK-LABEL: @test_vtrn2q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vtrn2q_u8(uint8x16_t a, uint8x16_t b) { return vtrn2q_u8(a, b); }
-// CHECK-LABEL: @test_vtrn2_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn2_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 uint16x4_t test_vtrn2_u16(uint16x4_t a, uint16x4_t b) { return vtrn2_u16(a, b); }
-// CHECK-LABEL: @test_vtrn2q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn2q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vtrn2q_u16(uint16x8_t a, uint16x8_t b) { return vtrn2q_u16(a, b); }
-// CHECK-LABEL: @test_vtrn2_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn2_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 uint32x2_t test_vtrn2_u32(uint32x2_t a, uint32x2_t b) { return vtrn2_u32(a, b); }
-// CHECK-LABEL: @test_vtrn2q_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn2q_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 uint32x4_t test_vtrn2q_u32(uint32x4_t a, uint32x4_t b) { return vtrn2q_u32(a, b); }
-// CHECK-LABEL: @test_vtrn2q_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn2q_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 uint64x2_t test_vtrn2q_u64(uint64x2_t a, uint64x2_t b) { return vtrn2q_u64(a, b); }
-// CHECK-LABEL: @test_vtrn2_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vtrn2_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
 float32x2_t test_vtrn2_f32(float32x2_t a, float32x2_t b) { return vtrn2_f32(a, b); }
-// CHECK-LABEL: @test_vtrn2q_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: ret <4 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vtrn2q_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]]
+//
 float32x4_t test_vtrn2q_f32(float32x4_t a, float32x4_t b) { return vtrn2q_f32(a, b); }
-// CHECK-LABEL: @test_vtrn2q_f64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: ret <2 x double> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x double> @test_vtrn2q_f64(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]]
+//
 float64x2_t test_vtrn2q_f64(float64x2_t a, float64x2_t b) { return vtrn2q_f64(a, b); }
-// CHECK-LABEL: @test_vtrn2_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vtrn2_p8(poly8x8_t a, poly8x8_t b) { return vtrn2_p8(a, b); }
-// CHECK-LABEL: @test_vtrn2q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 poly8x16_t test_vtrn2q_p8(poly8x16_t a, poly8x16_t b) { return vtrn2q_p8(a, b); }
-// CHECK-LABEL: @test_vtrn2_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn2_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vtrn2_p16(poly16x4_t a, poly16x4_t b) { return vtrn2_p16(a, b); }
-// CHECK-LABEL: @test_vtrn2q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn2q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) { return vtrn2q_p16(a, b); }
-// CHECK-LABEL: @test_vuzp_s8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
-// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8
-// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vuzp_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X8X2_T]] [[TMP1]]
+//
 int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) { return vuzp_s8(a, b); }
-// CHECK-LABEL: @test_vuzp_s16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int16x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vuzp_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X4X2_T]] [[TMP1]]
+//
 int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) { return vuzp_s16(a, b); }
-// CHECK-LABEL: @test_vuzp_s32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0
-// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.int32x2x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vuzp_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x i32>] [[TMP0]], <2 x i32> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT32X2X2_T:%.*]] poison, [2 x <2 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X2X2_T]] [[TMP1]]
+//
 int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) { return vuzp_s32(a, b); }
-// CHECK-LABEL: @test_vuzp_u8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
-// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vuzp_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X8X2_T]] [[TMP1]]
+//
 uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) { return vuzp_u8(a, b); }
-// CHECK-LABEL: @test_vuzp_u16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint16x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vuzp_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X4X2_T]] [[TMP1]]
+//
 uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) { return vuzp_u16(a, b); }
-// CHECK-LABEL: @test_vuzp_u32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x i32> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x i32> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0
-// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.uint32x2x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vuzp_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x i32>] [[TMP0]], <2 x i32> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T:%.*]] poison, [2 x <2 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X2X2_T]] [[TMP1]]
+//
 uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) { return vuzp_u32(a, b); }
-// CHECK-LABEL: @test_vuzp_f32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
-// CHECK: store <2 x float> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
-// CHECK: store <2 x float> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0
-// CHECK: store [2 x <2 x float>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.float32x2x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vuzp_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 0, i32 2>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> <i32 1, i32 3>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x float>] poison, <2 x float> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x float>] [[TMP0]], <2 x float> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T:%.*]] poison, [2 x <2 x float>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X2_T]] [[TMP1]]
+//
 float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) { return vuzp_f32(a, b); }
-// CHECK-LABEL: @test_vuzp_p8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i8> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
-// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly8x8x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vuzp_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X8X2_T]] [[TMP1]]
+//
 poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) { return vuzp_p8(a, b); }
-// CHECK-LABEL: @test_vuzp_p16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i16> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL_I]], align 8
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8
-// CHECK: [[TMP10:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly16x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vuzp_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X4X2_T]] [[TMP1]]
+//
 poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) { return vuzp_p16(a, b); }
-// CHECK-LABEL: @test_vuzpq_s8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: store <16 x i8> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vuzpq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT8X16X2_T]] [[TMP1]]
+//
 int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) { return vuzpq_s8(a, b); }
-// CHECK-LABEL: @test_vuzpq_s16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i16> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vuzpq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT16X8X2_T]] [[TMP1]]
+//
 int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) { return vuzpq_s16(a, b); }
-// CHECK-LABEL: @test_vuzpq_s32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i32> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i32> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.int32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vuzpq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i32>] [[TMP0]], <4 x i32> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT32X4X2_T:%.*]] poison, [2 x <4 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_INT32X4X2_T]] [[TMP1]]
+//
 int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) { return vuzpq_s32(a, b); }
-// CHECK-LABEL: @test_vuzpq_u8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: store <16 x i8> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vuzpq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X2_T]] [[TMP1]]
+//
 uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) { return vuzpq_u8(a, b); }
-// CHECK-LABEL: @test_vuzpq_u16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i16> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vuzpq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X2_T]] [[TMP1]]
+//
 uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) { return vuzpq_u16(a, b); }
-// CHECK-LABEL: @test_vuzpq_u32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x i32> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x i32> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vuzpq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i32>] [[TMP0]], <4 x i32> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T:%.*]] poison, [2 x <4 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X2_T]] [[TMP1]]
+//
 uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) { return vuzpq_u32(a, b); }
-// CHECK-LABEL: @test_vuzpq_f32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK: store <4 x float> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK: store <4 x float> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x float>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vuzpq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x float>] poison, <4 x float> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x float>] [[TMP0]], <4 x float> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T:%.*]] poison, [2 x <4 x float>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X2_T]] [[TMP1]]
+//
 float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) { return vuzpq_f32(a, b); }
-// CHECK-LABEL: @test_vuzpq_p8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
-// CHECK: store <16 x i8> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
-// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vuzpq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X2_T]] [[TMP1]]
+//
 poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) { return vuzpq_p8(a, b); }
-// CHECK-LABEL: @test_vuzpq_p16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK: store <8 x i16> [[VUZP_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vuzpq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VUZP_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VUZP1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X2_T]] [[TMP1]]
+//
 poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) { return vuzpq_p16(a, b); }
-// CHECK-LABEL: @test_vzip_s8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
%b, <8 x i32> -// CHECK: store <8 x i8> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0 -// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8 -// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int8x8x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vzip_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_INT8X8X2_T]] [[TMP1]] +// int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) { return vzip_s8(a, b); } -// CHECK-LABEL: @test_vzip_s16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0 -// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int16x4x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vzip_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_INT16X4X2_T]] [[TMP1]] +// int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) { return vzip_s16(a, b); } -// CHECK-LABEL: @test_vzip_s32( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 -// 
CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VZIP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0 -// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int32x2x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vzip_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x i32>] [[TMP0]], <2 x i32> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT32X2X2_T:%.*]] poison, [2 x <2 x i32>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_INT32X2X2_T]] [[TMP1]] +// int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) { return vzip_s32(a, b); } -// CHECK-LABEL: @test_vzip_u8( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0 -// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8 -// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint8x8x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vzip_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT8X8X2_T]] [[TMP1]] +// uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) { return vzip_u8(a, b); } -// CHECK-LABEL: @test_vzip_u16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0 -// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint16x4x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vzip_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT16X4X2_T]] [[TMP1]] +// uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) { return vzip_u16(a, b); } -// CHECK-LABEL: @test_vzip_u32( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VZIP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0 -// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint32x2x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vzip_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x i32>] [[TMP0]], <2 x i32> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T:%.*]] poison, [2 x <2 x i32>] [[DOTUNPACK8]], 0 +// 
CHECK-NEXT: ret [[STRUCT_UINT32X2X2_T]] [[TMP1]] +// uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) { return vzip_u32(a, b); } -// CHECK-LABEL: @test_vzip_f32( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: store <2 x float> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: store <2 x float> [[VZIP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0 -// CHECK: store [2 x <2 x float>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float32x2x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vzip_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x float>] poison, <2 x float> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x float>] [[TMP0]], <2 x float> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T:%.*]] poison, [2 x <2 x float>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X2_T]] [[TMP1]] +// float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) { return vzip_f32(a, b); } -// CHECK-LABEL: @test_vzip_p8( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0 -// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8 -// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly8x8x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vzip_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] 
[[TMP0]], <8 x i8> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY8X8X2_T]] [[TMP1]] +// poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) { return vzip_p8(a, b); } -// CHECK-LABEL: @test_vzip_p16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0 -// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly16x4x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vzip_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY16X4X2_T]] [[TMP1]] +// poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) { return vzip_p16(a, b); } -// CHECK-LABEL: @test_vzipq_s8( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 -// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL_I]], align 16 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0 -// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16 -// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int8x16x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vzipq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x 
<16 x i8>] poison, <16 x i8> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_INT8X16X2_T]] [[TMP1]] +// int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) { return vzipq_s8(a, b); } -// CHECK-LABEL: @test_vzipq_s16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL_I]], align 16 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0 -// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16 -// CHECK: [[TMP10:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int16x8x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vzipq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_INT16X8X2_T]] [[TMP1]] +// int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) { return vzipq_s16(a, b); } -// CHECK-LABEL: @test_vzipq_s32( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VZIP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL_I]], align 16 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0 -// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16 -// CHECK: [[TMP10:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int32x4x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vzipq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] 
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i32>] [[TMP0]], <4 x i32> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT32X4X2_T:%.*]] poison, [2 x <4 x i32>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_INT32X4X2_T]] [[TMP1]] +// int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) { return vzipq_s32(a, b); } -// CHECK-LABEL: @test_vzipq_u8( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16 -// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL_I]], align 16 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0 -// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16 -// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint8x16x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vzipq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT8X16X2_T]] [[TMP1]] +// uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) { return vzipq_u8(a, b); } -// CHECK-LABEL: @test_vzipq_u16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL_I]], align 16 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0 -// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16 -// CHECK: [[TMP10:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint16x8x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.uint16x8x2_t 
@test_vzipq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT16X8X2_T]] [[TMP1]] +// uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) { return vzipq_u16(a, b); } -// CHECK-LABEL: @test_vzipq_u32( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VZIP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL_I]], align 16 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0 -// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16 -// CHECK: [[TMP10:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint32x4x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vzipq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i32>] [[TMP0]], <4 x i32> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T:%.*]] poison, [2 x <4 x i32>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT32X4X2_T]] [[TMP1]] +// uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) { return vzipq_u32(a, b); } -// CHECK-LABEL: @test_vzipq_f32( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> -// CHECK: store <4 x float> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> -// CHECK: store <4 x float> [[VZIP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL_I]], align 16 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue 
%struct.float32x4x2_t [[TMP7]], 0 -// CHECK: store [2 x <4 x float>] [[TMP9]], ptr [[TMP8]], align 16 -// CHECK: [[TMP10:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.float32x4x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vzipq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x float>] poison, <4 x float> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x float>] [[TMP0]], <4 x float> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T:%.*]] poison, [2 x <4 x float>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X2_T]] [[TMP1]] +// float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) { return vzipq_f32(a, b); } -// CHECK-LABEL: @test_vzipq_p8( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 -// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL_I]], align 16 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0 -// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16 -// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly8x16x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vzipq_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY8X16X2_T]] [[TMP1]] +// poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) { return vzipq_p8(a, b); } -// CHECK-LABEL: @test_vzipq_p16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VZIP_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, 
ptr [[RETVAL_I]], align 16 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0 -// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16 -// CHECK: [[TMP10:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.poly16x8x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vzipq_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY16X8X2_T]] [[TMP1]] +// poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) { return vzipq_p16(a, b); } -// CHECK-LABEL: @test_vtrn_s8( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0 -// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8 -// CHECK: [[TMP8:%.*]] = load %struct.int8x8x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int8x8x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.int8x8x2_t @test_vtrn_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VTRN_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VTRN1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_INT8X8X2_T]] [[TMP1]] +// int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) { return vtrn_s8(a, b); } -// CHECK-LABEL: @test_vtrn_s16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> 
[[VTRN1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x4x2_t [[TMP7]], 0 -// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.int16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int16x4x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.int16x4x2_t @test_vtrn_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VTRN_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VTRN1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_INT16X4X2_T]] [[TMP1]] +// int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) { return vtrn_s16(a, b); } -// CHECK-LABEL: @test_vtrn_s32( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VTRN1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x2x2_t [[TMP7]], 0 -// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.int32x2x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.int32x2x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.int32x2x2_t @test_vtrn_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[VTRN_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x i32>] [[TMP0]], <2 x i32> [[VTRN1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT32X2X2_T:%.*]] poison, [2 x <2 x i32>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_INT32X2X2_T]] [[TMP1]] +// int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) { return vtrn_s32(a, b); } -// CHECK-LABEL: @test_vtrn_u8( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: 
[[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0 -// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8 -// CHECK: [[TMP8:%.*]] = load %struct.uint8x8x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint8x8x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.uint8x8x2_t @test_vtrn_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VTRN_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VTRN1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT8X8X2_T]] [[TMP1]] +// uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) { return vtrn_u8(a, b); } -// CHECK-LABEL: @test_vtrn_u16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP7]], 0 -// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.uint16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint16x4x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.uint16x4x2_t @test_vtrn_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VTRN_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VTRN1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT16X4X2_T]] [[TMP1]] +// uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) { return vtrn_u16(a, b); } -// CHECK-LABEL: @test_vtrn_u32( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] 
= shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VTRN1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP7]], 0 -// CHECK: store [2 x <2 x i32>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.uint32x2x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.uint32x2x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.uint32x2x2_t @test_vtrn_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[VTRN_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x i32>] [[TMP0]], <2 x i32> [[VTRN1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT32X2X2_T:%.*]] poison, [2 x <2 x i32>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_UINT32X2X2_T]] [[TMP1]] +// uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) { return vtrn_u32(a, b); } -// CHECK-LABEL: @test_vtrn_f32( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: store <2 x float> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: store <2 x float> [[VTRN1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x2x2_t [[TMP7]], 0 -// CHECK: store [2 x <2 x float>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.float32x2x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.float32x2x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.float32x2x2_t @test_vtrn_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x float>] poison, <2 x float> [[VTRN_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <2 x float>] [[TMP0]], <2 x float> [[VTRN1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT32X2X2_T:%.*]] poison, [2 x <2 x float>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT32X2X2_T]] [[TMP1]] +// float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) { return vtrn_f32(a, b); } -// CHECK-LABEL: 
@test_vtrn_p8( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0 -// CHECK: store [2 x <8 x i8>] [[TMP7]], ptr [[TMP6]], align 8 -// CHECK: [[TMP8:%.*]] = load %struct.poly8x8x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly8x8x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.poly8x8x2_t @test_vtrn_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[VTRN_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i8>] [[TMP0]], <8 x i8> [[VTRN1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY8X8X2_T:%.*]] poison, [2 x <8 x i8>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY8X8X2_T]] [[TMP1]] +// poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) { return vtrn_p8(a, b); } -// CHECK-LABEL: @test_vtrn_p16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL_I]], align 8 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP7]], 0 -// CHECK: store [2 x <4 x i16>] [[TMP9]], ptr [[TMP8]], align 8 -// CHECK: [[TMP10:%.*]] = load %struct.poly16x4x2_t, ptr [[RETVAL]], align 8 -// CHECK: ret %struct.poly16x4x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.poly16x4x2_t @test_vtrn_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i16>] poison, <4 x i16> [[VTRN_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i16>] [[TMP0]], <4 x i16> [[VTRN1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY16X4X2_T:%.*]] poison, [2 x <4 x i16>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_POLY16X4X2_T]] [[TMP1]] +// poly16x4x2_t 
test_vtrn_p16(poly16x4_t a, poly16x4_t b) { return vtrn_p16(a, b); } -// CHECK-LABEL: @test_vtrnq_s8( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 -// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL_I]], align 16 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0 -// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16 -// CHECK: [[TMP8:%.*]] = load %struct.int8x16x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int8x16x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.int8x16x2_t @test_vtrnq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VTRN_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VTRN1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_INT8X16X2_T]] [[TMP1]] +// int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) { return vtrnq_s8(a, b); } -// CHECK-LABEL: @test_vtrnq_s16( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL_I]], align 16 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.int16x8x2_t [[TMP7]], 0 -// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16 -// CHECK: [[TMP10:%.*]] = load %struct.int16x8x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int16x8x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.int16x8x2_t @test_vtrnq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VTRN_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VTRN1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT16X8X2_T:%.*]] 
poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_INT16X8X2_T]] [[TMP1]] +// int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) { return vtrnq_s16(a, b); } -// CHECK-LABEL: @test_vtrnq_s32( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VTRN1_I]], ptr [[TMP4]] -// CHECK: [[TMP7:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL_I]], align 16 -// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP9:%.*]] = extractvalue %struct.int32x4x2_t [[TMP7]], 0 -// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16 -// CHECK: [[TMP10:%.*]] = load %struct.int32x4x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.int32x4x2_t [[TMP10]] +// CHECK-LABEL: define dso_local %struct.int32x4x2_t @test_vtrnq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[VTRN_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i32>] [[TMP0]], <4 x i32> [[VTRN1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_INT32X4X2_T:%.*]] poison, [2 x <4 x i32>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_INT32X4X2_T]] [[TMP1]] +// int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) { return vtrnq_s32(a, b); } -// CHECK-LABEL: @test_vtrnq_u8( -// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16 -// CHECK: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16 -// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN_I]], ptr [[RETVAL_I]] -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]] -// CHECK: [[TMP5:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL_I]], align 16 -// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[RETVAL]], i32 0, i32 0 -// CHECK: [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0 -// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16 -// CHECK: [[TMP8:%.*]] = load %struct.uint8x16x2_t, ptr [[RETVAL]], align 16 -// CHECK: ret %struct.uint8x16x2_t [[TMP8]] +// CHECK-LABEL: define dso_local %struct.uint8x16x2_t @test_vtrnq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VTRN_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue 
[2 x <16 x i8>] [[TMP0]], <16 x i8> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT8X16X2_T]] [[TMP1]]
+//
 uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
   return vtrnq_u8(a, b);
 }
 
-// CHECK-LABEL: @test_vtrnq_u16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.uint16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint16x8x2_t @test_vtrnq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT16X8X2_T]] [[TMP1]]
+//
 uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
   return vtrnq_u16(a, b);
 }
 
-// CHECK-LABEL: @test_vtrnq_u32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x i32> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x i32> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x i32>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.uint32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.uint32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.uint32x4x2_t @test_vtrnq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x i32>] [[TMP0]], <4 x i32> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_UINT32X4X2_T:%.*]] poison, [2 x <4 x i32>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_UINT32X4X2_T]] [[TMP1]]
+//
 uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
   return vtrnq_u32(a, b);
 }
 
-// CHECK-LABEL: @test_vtrnq_f32(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK: store <4 x float> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK: store <4 x float> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.float32x4x2_t [[TMP7]], 0
-// CHECK: store [2 x <4 x float>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.float32x4x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.float32x4x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.float32x4x2_t @test_vtrnq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x float>] poison, <4 x float> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x float>] [[TMP0]], <4 x float> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT32X4X2_T:%.*]] poison, [2 x <4 x float>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_FLOAT32X4X2_T]] [[TMP1]]
+//
 float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
   return vtrnq_f32(a, b);
 }
 
-// CHECK-LABEL: @test_vtrnq_p8(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
-// CHECK: store <16 x i8> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
-// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]]
-// CHECK: [[TMP5:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP6:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
-// CHECK: store [2 x <16 x i8>] [[TMP7]], ptr [[TMP6]], align 16
-// CHECK: [[TMP8:%.*]] = load %struct.poly8x16x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly8x16x2_t [[TMP8]]
+// CHECK-LABEL: define dso_local %struct.poly8x16x2_t @test_vtrnq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <16 x i8>] [[TMP0]], <16 x i8> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY8X16X2_T:%.*]] poison, [2 x <16 x i8>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY8X16X2_T]] [[TMP1]]
+//
 poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
   return vtrnq_p8(a, b);
 }
 
-// CHECK-LABEL: @test_vtrnq_p16(
-// CHECK: [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK: store <8 x i16> [[VTRN_I]], ptr [[RETVAL_I]]
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[RETVAL_I]], i32 1
-// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]]
-// CHECK: [[TMP7:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL_I]], align 16
-// CHECK: [[TMP8:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[RETVAL]], i32 0, i32 0
-// CHECK: [[TMP9:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP7]], 0
-// CHECK: store [2 x <8 x i16>] [[TMP9]], ptr [[TMP8]], align 16
-// CHECK: [[TMP10:%.*]] = load %struct.poly16x8x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly16x8x2_t [[TMP10]]
+// CHECK-LABEL: define dso_local %struct.poly16x8x2_t @test_vtrnq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x i16>] poison, <8 x i16> [[VTRN_I]], 0
+// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x i16>] [[TMP0]], <8 x i16> [[VTRN1_I]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY16X8X2_T:%.*]] poison, [2 x <8 x i16>] [[DOTUNPACK8]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY16X8X2_T]] [[TMP1]]
+//
 poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
   return vtrnq_p16(a, b);
 }
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.c
index 1d0db697e4fdd..38efeb2559fd4 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.c
@@ -1,17 +1,11 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=UNCONSTRAINED %s
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine \
+// RUN: | FileCheck --check-prefix=UNCONSTRAINED %s
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
 // RUN: -ffp-exception-behavior=strict \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=CONSTRAINED %s
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | llc -o=- - \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
-// RUN: -ffp-exception-behavior=strict \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | llc -o=- - \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine \
+// RUN: | FileCheck --check-prefix=CONSTRAINED %s
 
 // REQUIRES: aarch64-registered-target
 
@@ -19,113 +13,162 @@ #include <arm_neon.h>
 
-// COMMON-LABEL: test_vfmas_lane_f32
-// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x float> %c, i32 1
-// UNCONSTRAINED: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
-// CONSTRAINED: [[TMP2:%.*]] = call float @llvm.experimental.constrained.fma.f32(float %b, float [[EXTRACT]], float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}.s[{{[0-9]+}}]
-// COMMONIR: ret float [[TMP2]]
+// UNCONSTRAINED-LABEL: define dso_local float @test_vfmas_lane_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i64 1
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B]], float [[EXTRACT]], float [[A]])
+// UNCONSTRAINED-NEXT: ret float [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local float @test_vfmas_lane_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i64 1
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.experimental.constrained.fma.f32(float [[B]], float [[EXTRACT]], float [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2:[0-9]+]]
+// CONSTRAINED-NEXT: ret float [[TMP0]]
+//
 float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
   return vfmas_lane_f32(a, b, c, 1);
 }
 
-// COMMON-LABEL: test_vfmad_lane_f64
-// COMMONIR: [[EXTRACT:%.*]] = extractelement <1 x double> %c, i32 0
-// UNCONSTRAINED: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
-// CONSTRAINED: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fma.f64(double %b, double [[EXTRACT]], double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// COMMONIR: ret double [[TMP2]]
+// UNCONSTRAINED-LABEL: define dso_local double @test_vfmad_lane_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[C]], i64 0
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]])
+// UNCONSTRAINED-NEXT: ret double [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local double @test_vfmad_lane_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[C]], i64 0
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[B]], double [[EXTRACT]], double [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret double [[TMP0]]
+//
 float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
   return vfmad_lane_f64(a, b, c, 0);
 }
 
-// COMMON-LABEL: test_vfmad_laneq_f64
-// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x double> %c, i32 1
-// UNCONSTRAINED: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
-// CONSTRAINED: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fma.f64(double %b, double [[EXTRACT]], double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla d{{[0-9]+}}, d{{[0-9]+}}, v{{[0-9]+}}.d[{{[0-9]+}}]
-// COMMONIR: ret double [[TMP2]]
+// UNCONSTRAINED-LABEL: define dso_local double @test_vfmad_laneq_f64(
+// UNCONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <2 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[C]], i64 1
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]])
+// UNCONSTRAINED-NEXT: ret double [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local double @test_vfmad_laneq_f64(
+// CONSTRAINED-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <2 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[C]], i64 1
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[B]], double [[EXTRACT]], double [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret double [[TMP0]]
+//
 float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
   return vfmad_laneq_f64(a, b, c, 1);
 }
 
-// COMMON-LABEL: test_vfmss_lane_f32
-// COMMONIR: [[SUB:%.*]] = fneg float %b
-// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x float> %c, i32 1
-// UNCONSTRAINED: [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
-// CONSTRAINED: [[TMP2:%.*]] = call float @llvm.experimental.constrained.fma.f32(float [[SUB]], float [[EXTRACT]], float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmls s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}.s[{{[0-9]+}}]
-// COMMONIR: ret float [[TMP2]]
+// UNCONSTRAINED-LABEL: define dso_local float @test_vfmss_lane_f32(
+// UNCONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg float [[B]]
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i64 1
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A]])
+// UNCONSTRAINED-NEXT: ret float [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local float @test_vfmss_lane_f32(
+// CONSTRAINED-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg float [[B]]
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i64 1
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call float @llvm.experimental.constrained.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret float [[TMP0]]
+//
 float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
   return vfmss_lane_f32(a, b, c, 1);
 }
 
-// COMMON-LABEL: test_vfma_lane_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// COMMONIR: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// COMMONIR: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// COMMONIR: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// UNCONSTRAINED: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CONSTRAINED: [[FMLA2:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// COMMONIR: ret <1 x double> [[FMLA2]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_lane_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[V]], <1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[FMLA2]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_lane_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[B]], <1 x double> [[V]], <1 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <1 x double> [[FMLA2]]
+//
 float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfma_lane_f64(a, b, v, 0);
 }
 
-// COMMON-LABEL: test_vfms_lane_f64
-// COMMONIR: [[SUB:%.*]] = fneg <1 x double> %b
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// COMMONIR: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// COMMONIR: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// COMMONIR: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// UNCONSTRAINED: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CONSTRAINED: [[FMLA2:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// COMMONIR: ret <1 x double> [[FMLA2]]
+
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
+// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG]], <1 x double> [[V]], <1 x double> [[A]])
+// UNCONSTRAINED-NEXT: ret <1 x double> [[FMLA2]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
+// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> [[FNEG]], <1 x double> [[V]], <1 x double> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <1 x double> [[FMLA2]]
+//
 float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfms_lane_f64(a, b, v, 0);
 }
 
-// COMMON-LABEL: test_vfma_laneq_f64
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// UNCONSTRAINED: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CONSTRAINED: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// COMMONIR: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// COMMONIR: ret <1 x double> [[TMP7]]
+
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A]], i64 0
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = extractelement <1 x double> [[B]], i64 0
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V]], i64 0
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[EXTRACT]], double [[TMP0]])
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to <1 x double>
+// UNCONSTRAINED-NEXT: ret <1 x double> [[TMP3]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A]], i64 0
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = extractelement <1 x double> [[B]], i64 0
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V]], i64 0
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP1]], double [[EXTRACT]], double [[TMP0]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to <1 x double>
+// CONSTRAINED-NEXT: ret <1 x double> [[TMP3]]
+//
 float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfma_laneq_f64(a, b, v, 0);
 }
 
-// COMMON-LABEL: test_vfms_laneq_f64
-// COMMONIR: [[SUB:%.*]] = fneg <1 x double> %b
-// CHECK-ASM: fneg d{{[0-9]+}}, d{{[0-9]+}}
-// COMMONIR: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// COMMONIR: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// UNCONSTRAINED: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CONSTRAINED: [[TMP6:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// COMMONIR: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// COMMONIR: ret <1 x double> [[TMP7]]
+// UNCONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
+// UNCONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A]], i64 0
+// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = extractelement <1 x double> [[B]], i64 0
+// UNCONSTRAINED-NEXT: [[TMP2:%.*]] = fneg double [[TMP1]]
+// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V]], i64 0
+// UNCONSTRAINED-NEXT: [[TMP3:%.*]] = call double @llvm.fma.f64(double [[TMP2]], double [[EXTRACT]], double [[TMP0]])
+// UNCONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
+// UNCONSTRAINED-NEXT: ret <1 x double> [[TMP4]]
+//
+// CONSTRAINED-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
+// CONSTRAINED-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A]], i64 0
+// CONSTRAINED-NEXT: [[TMP1:%.*]] = extractelement <1 x double> [[B]], i64 0
+// CONSTRAINED-NEXT: [[TMP2:%.*]] = fneg double [[TMP1]]
+// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V]], i64 0
+// CONSTRAINED-NEXT: [[TMP3:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP2]], double [[EXTRACT]], double [[TMP0]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
+// CONSTRAINED-NEXT: ret <1 x double> [[TMP4]]
+//
 float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfms_laneq_f64(a, b, v, 0);
 }
-
diff --git a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
index 8b7b976ab5e5a..1d6bc1eda9ad9 100644
--- a/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/AArch64/neon-scalar-x-indexed-elem.c
@@ -1,419 +1,507 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: define{{.*}} float @test_vmuls_lane_f32(float noundef %a, <2 x float> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> %b, i32 1
-// CHECK: [[MUL:%.*]] = fmul float %a, [[VGET_LANE]]
-// CHECK: ret float [[MUL]]
+// CHECK-LABEL: define dso_local float @test_vmuls_lane_f32(
+// CHECK-SAME: float noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x float> [[B]], i64 1
+// CHECK-NEXT: [[MUL:%.*]] = fmul float [[A]], [[VGET_LANE]]
+// CHECK-NEXT: ret float [[MUL]]
+//
 float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) {
   return vmuls_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} double @test_vmuld_lane_f64(double noundef %a, <1 x double> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> %b, i32 0
-// CHECK: [[MUL:%.*]] = fmul double %a, [[VGET_LANE]]
-// CHECK: ret double [[MUL]]
+// CHECK-LABEL: define dso_local double @test_vmuld_lane_f64(
+// CHECK-SAME: double noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[B]], i64 0
+// CHECK-NEXT: [[MUL:%.*]] = fmul double [[A]], [[VGET_LANE]]
+// CHECK-NEXT: ret double [[MUL]]
+//
 float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) {
   return vmuld_lane_f64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} float @test_vmuls_laneq_f32(float noundef %a, <4 x float> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> %b, i32 3
-// CHECK: [[MUL:%.*]] = fmul float %a, [[VGETQ_LANE]]
-// CHECK: ret float [[MUL]]
+// CHECK-LABEL: define dso_local float @test_vmuls_laneq_f32(
+// CHECK-SAME: float noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[B]], i64 3
+// CHECK-NEXT: [[MUL:%.*]] = fmul float [[A]], [[VGETQ_LANE]]
+// CHECK-NEXT: ret float [[MUL]]
+//
 float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) {
   return vmuls_laneq_f32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} double @test_vmuld_laneq_f64(double noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 1
-// CHECK: [[MUL:%.*]] = fmul double %a, [[VGETQ_LANE]]
-// CHECK: ret double [[MUL]]
+// CHECK-LABEL: define dso_local double @test_vmuld_laneq_f64(
+// CHECK-SAME: double noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[B]], i64 1
+// CHECK-NEXT: [[MUL:%.*]] = fmul double [[A]], [[VGETQ_LANE]]
+// CHECK-NEXT: ret double [[MUL]]
+//
 float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
   return vmuld_laneq_f64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmul_n_f64(<1 x double> noundef %a, double noundef %b) #0 {
-// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %a to double
-// CHECK: [[TMP3:%.*]] = fmul double [[TMP2]], %b
-// CHECK: [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
-// CHECK: ret <1 x double> [[TMP4]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmul_n_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = fmul double [[TMP0]], [[B]]
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP2]]
+//
 float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
   return vmul_n_f64(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} float @test_vmulxs_lane_f32(float noundef %a, <2 x float> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> %b, i32 1
-// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGET_LANE]])
-// CHECK: ret float [[VMULXS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vmulxs_lane_f32(
+// CHECK-SAME: float noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x float> [[B]], i64 1
+// CHECK-NEXT: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float [[A]], float [[VGET_LANE]])
+// CHECK-NEXT: ret float [[VMULXS_F32_I]]
+//
 float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) {
   return vmulxs_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} float @test_vmulxs_laneq_f32(float noundef %a, <4 x float> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x float> %b, i32 3
-// CHECK: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGETQ_LANE]])
-// CHECK: ret float [[VMULXS_F32_I]]
+// CHECK-LABEL: define dso_local float @test_vmulxs_laneq_f32(
+// CHECK-SAME: float noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[B]], i64 3
+// CHECK-NEXT: [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float [[A]], float [[VGETQ_LANE]])
+// CHECK-NEXT: ret float [[VMULXS_F32_I]]
+//
 float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) {
   return vmulxs_laneq_f32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} double @test_vmulxd_lane_f64(double noundef %a, <1 x double> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> %b, i32 0
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGET_LANE]])
-// CHECK: ret double [[VMULXD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vmulxd_lane_f64(
+// CHECK-SAME: double noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[B]], i64 0
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[A]], double [[VGET_LANE]])
+// CHECK-NEXT: ret double [[VMULXD_F64_I]]
+//
 float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) {
   return vmulxd_lane_f64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} double @test_vmulxd_laneq_f64(double noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 1
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGETQ_LANE]])
-// CHECK: ret double [[VMULXD_F64_I]]
+// CHECK-LABEL: define dso_local double @test_vmulxd_laneq_f64(
+// CHECK-SAME: double noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[B]], i64 1
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[A]], double [[VGETQ_LANE]])
+// CHECK-NEXT: ret double [[VMULXD_F64_I]]
+//
 float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
   return vmulxd_laneq_f64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_lane_f64(<1 x double> noundef %a, <1 x double> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> %a, i32 0
-// CHECK: [[VGET_LANE6:%.*]] = extractelement <1 x double> %b, i32 0
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE6]])
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> %a, double [[VMULXD_F64_I]], i32 0
-// CHECK: ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_lane_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[A]], i64 0
+// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <1 x double> [[B]], i64 0
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE3]])
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> poison, double [[VMULXD_F64_I]], i64 0
+// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
+//
 float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) {
   return vmulx_lane_f64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_laneq_f64_0(<1 x double> noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> %a, i32 0
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 0
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> %a, double [[VMULXD_F64_I]], i32 0
-// CHECK: ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_laneq_f64_0(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[A]], i64 0
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[B]], i64 0
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> poison, double [[VMULXD_F64_I]], i64 0
+// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
+//
 float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) {
   return vmulx_laneq_f64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_laneq_f64_1(<1 x double> noundef %a, <2 x double> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> %a, i32 0
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 1
-// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> %a, double [[VMULXD_F64_I]], i32 0
-// CHECK: ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_laneq_f64_1(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x double> [[A]], i64 0
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[B]], i64 1
+// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> poison, double [[VMULXD_F64_I]], i64 0
+// CHECK-NEXT: ret <1 x double> [[VSET_LANE]]
+//
 float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) {
   return vmulx_laneq_f64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} float @test_vfmas_lane_f32(float noundef %a, float noundef %b, <2 x float> noundef %c) #0 {
-// CHECK: [[EXTRACT:%.*]] = extractelement <2 x float> %c, i32 1
-// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
-// CHECK: ret float [[TMP2]]
+// CHECK-LABEL: define dso_local float @test_vfmas_lane_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B]], float [[EXTRACT]], float [[A]])
+// CHECK-NEXT: ret float [[TMP0]]
+//
 float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
   return vfmas_lane_f32(a, b, c, 1);
 }
 
-// CHECK-LABEL: define{{.*}} double @test_vfmad_lane_f64(double noundef %a, double noundef %b, <1 x double> noundef %c) #0 {
-// CHECK: [[EXTRACT:%.*]] = extractelement <1 x double> %c, i32 0
-// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
-// CHECK: ret double [[TMP2]]
+// CHECK-LABEL: define dso_local double @test_vfmad_lane_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <1 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <1 x double> [[C]], i64 0
+// CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]])
+// CHECK-NEXT: ret double [[TMP0]]
+//
 float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
   return vfmad_lane_f64(a, b, c, 0);
 }
 
-// CHECK-LABEL: define{{.*}} double @test_vfmad_laneq_f64(double noundef %a, double noundef %b, <2 x double> noundef %c) #0 {
-// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> %c, i32 1
-// CHECK: [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
-// CHECK: ret double [[TMP2]]
+// CHECK-LABEL: define dso_local double @test_vfmad_laneq_f64(
+// CHECK-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]], <2 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[C]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B]], double [[EXTRACT]], double [[A]])
+// CHECK-NEXT: ret double [[TMP0]]
+//
 float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
   return vfmad_laneq_f64(a, b, c, 1);
 }
 
-// CHECK-LABEL: define{{.*}} float @test_vfmss_lane_f32(float noundef %a, float noundef %b, <2 x float> noundef %c) #0 {
-// CHECK: [[SUB:%.*]] = fneg float %b
-// CHECK: [[EXTRACT:%.*]] = extractelement <2 x float> %c, i32 1
-// CHECK: [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
-// CHECK: ret float [[TMP2]]
+// CHECK-LABEL: define dso_local float @test_vfmss_lane_f32(
+// CHECK-SAME: float noundef [[A:%.*]], float noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG:%.*]] = fneg float [[B]]
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x float> [[C]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A]])
+// CHECK-NEXT: ret float [[TMP0]]
+//
 float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
   return vfmss_lane_f32(a, b, c, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vfma_lane_f64(<1 x double> noundef %a, <1 x double> noundef %b, <1 x double> noundef %v) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CHECK: ret <1 x double> [[FMLA2]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vfma_lane_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[V]], <1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[FMLA2]]
+//
 float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfma_lane_f64(a, b, v, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vfms_lane_f64(<1 x double> noundef %a, <1 x double> noundef %b, <1 x double> noundef %v) #0 {
-// CHECK: [[SUB:%.*]] = fneg <1 x double> %b
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CHECK: ret <1 x double> [[FMLA2]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vfms_lane_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <1 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG:%.*]] = fneg <1 x double> [[B]]
+// CHECK-NEXT: [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG]], <1 x double> [[V]], <1 x double> [[A]])
+// CHECK-NEXT: ret <1 x double> [[FMLA2]]
+//
 float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfms_lane_f64(a, b, v, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vfma_laneq_f64(<1 x double> noundef %a, <1 x double> noundef %b, <2 x double> noundef %v) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// CHECK: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CHECK: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// CHECK: ret <1 x double> [[TMP7]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vfma_laneq_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x double> [[B]], i64 0
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[EXTRACT]], double [[TMP0]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP3]]
+//
 float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfma_laneq_f64(a, b, v, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vfms_laneq_f64(<1 x double> noundef %a, <1 x double> noundef %b, <2 x double> noundef %v) #0 {
-// CHECK: [[SUB:%.*]] = fneg <1 x double> %b
-// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK: [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// CHECK: [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CHECK: [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// CHECK: ret <1 x double> [[TMP7]]
+// CHECK-LABEL: define dso_local <1 x double> @test_vfms_laneq_f64(
+// CHECK-SAME: <1 x double> noundef [[A:%.*]], <1 x double> noundef [[B:%.*]], <2 x double> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x double> [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = extractelement <1 x double> [[B]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = fneg double [[TMP1]]
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <2 x double> [[V]], i64 0
+// CHECK-NEXT: [[TMP3:%.*]] = call double @llvm.fma.f64(double [[TMP2]], double [[EXTRACT]], double [[TMP0]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
+// CHECK-NEXT: ret <1 x double> [[TMP4]]
+//
 float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfms_laneq_f64(a, b, v, 0);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmullh_lane_s16(i16 noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %b, i32 3
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGET_LANE]], i64 0
-// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
-// CHECK: ret i32 [[TMP4]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmullh_lane_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
 int32_t test_vqdmullh_lane_s16(int16_t a, int16x4_t b) {
   return vqdmullh_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vqdmulls_lane_s32(i32 noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %b, i32 1
-// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGET_LANE]])
-// CHECK: ret i64 [[VQDMULLS_S32_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmulls_lane_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[B]], i64 1
+// CHECK-NEXT: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[A]], i32 [[VGET_LANE]])
+// CHECK-NEXT: ret i64 [[VQDMULLS_S32_I]]
+//
 int64_t test_vqdmulls_lane_s32(int32_t a, int32x2_t b) {
   return vqdmulls_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmullh_laneq_s16(i16 noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> %b, i32 7
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGETQ_LANE]], i64 0
-// CHECK: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
-// CHECK: ret i32 [[TMP4]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmullh_laneq_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 7, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
+// CHECK-NEXT: ret i32 [[TMP2]]
+//
 int32_t test_vqdmullh_laneq_s16(int16_t a, int16x8_t b) {
   return vqdmullh_laneq_s16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vqdmulls_laneq_s32(i32 noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> %b, i32 3
-// CHECK: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGETQ_LANE]])
-// CHECK: ret i64 [[VQDMULLS_S32_I]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmulls_laneq_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[B]], i64 3
+// CHECK-NEXT: [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[A]], i32 [[VGETQ_LANE]])
+// CHECK-NEXT: ret i64 [[VQDMULLS_S32_I]]
+//
 int64_t test_vqdmulls_laneq_s32(int32_t a, int32x4_t b) {
   return vqdmulls_laneq_s32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vqdmulhh_lane_s16(i16 noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %b, i32 3
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGET_LANE]], i64 0
-// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP4]]
+// CHECK-LABEL: define dso_local i16 @test_vqdmulhh_lane_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
 int16_t test_vqdmulhh_lane_s16(int16_t a, int16x4_t b) {
   return vqdmulhh_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmulhs_lane_s32(i32 noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %b, i32 1
-// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGET_LANE]])
-// CHECK: ret i32 [[VQDMULHS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmulhs_lane_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[B]], i64 1
+// CHECK-NEXT: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 [[A]], i32 [[VGET_LANE]])
+// CHECK-NEXT: ret i32 [[VQDMULHS_S32_I]]
+//
 int32_t test_vqdmulhs_lane_s32(int32_t a, int32x2_t b) {
   return vqdmulhs_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vqdmulhh_laneq_s16(i16 noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> %b, i32 7
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGETQ_LANE]], i64 0
-// CHECK: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP4]]
+// CHECK-LABEL: define dso_local i16 @test_vqdmulhh_laneq_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 7, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
 int16_t test_vqdmulhh_laneq_s16(int16_t a, int16x8_t b) {
   return vqdmulhh_laneq_s16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmulhs_laneq_s32(i32 noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> %b, i32 3
-// CHECK: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGETQ_LANE]])
-// CHECK: ret i32 [[VQDMULHS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmulhs_laneq_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[B]], i64 3
+// CHECK-NEXT: [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 [[A]], i32 [[VGETQ_LANE]])
+// CHECK-NEXT: ret i32 [[VQDMULHS_S32_I]]
+//
 int32_t test_vqdmulhs_laneq_s32(int32_t a, int32x4_t b) {
   return vqdmulhs_laneq_s32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vqrdmulhh_lane_s16(i16 noundef %a, <4 x i16> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %b, i32 3
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGET_LANE]], i64 0
-// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP4]]
+// CHECK-LABEL: define dso_local i16 @test_vqrdmulhh_lane_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
 int16_t test_vqrdmulhh_lane_s16(int16_t a, int16x4_t b) {
   return vqrdmulhh_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vqrdmulhs_lane_s32(i32 noundef %a, <2 x i32> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %b, i32 1
-// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGET_LANE]])
-// CHECK: ret i32 [[VQRDMULHS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqrdmulhs_lane_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[B]], i64 1
+// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[A]], i32 [[VGET_LANE]])
+// CHECK-NEXT: ret i32 [[VQRDMULHS_S32_I]]
+//
 int32_t test_vqrdmulhs_lane_s32(int32_t a, int32x2_t b) {
   return vqrdmulhs_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vqrdmulhh_laneq_s16(i16 noundef %a, <8 x i16> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> %b, i32 7
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %a, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[VGETQ_LANE]], i64 0
-// CHECK: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
-// CHECK: ret i16 [[TMP4]]
+// CHECK-LABEL: define dso_local i16 @test_vqrdmulhh_laneq_s16(
+// CHECK-SAME: i16 noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <4 x i32> <i32 7, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK-NEXT: ret i16 [[TMP2]]
+//
 int16_t test_vqrdmulhh_laneq_s16(int16_t a, int16x8_t b) {
   return vqrdmulhh_laneq_s16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vqrdmulhs_laneq_s32(i32 noundef %a, <4 x i32> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> %b, i32 3
-// CHECK: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGETQ_LANE]])
-// CHECK: ret i32 [[VQRDMULHS_S32_I]]
+// CHECK-LABEL: define dso_local i32 @test_vqrdmulhs_laneq_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[B]], i64 3
+// CHECK-NEXT: [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[A]], i32 [[VGETQ_LANE]])
+// CHECK-NEXT: ret i32 [[VQRDMULHS_S32_I]]
+//
 int32_t test_vqrdmulhs_laneq_s32(int32_t a, int32x4_t b) {
   return vqrdmulhs_laneq_s32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmlalh_lane_s16(i32 noundef %a, i16 noundef %b, <4 x i16> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <4 x i16> %c, i32 3
-// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0
-// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[LANE]], i64 0
-// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
-// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
-// CHECK: ret i32 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i32 @test_vqdmlalh_lane_s16(
+// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A]], i32 [[LANE0]])
+// CHECK-NEXT: ret i32 [[VQDMLXL1]]
+//
 int32_t test_vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
   return vqdmlalh_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vqdmlals_lane_s32(i64 noundef %a, i32 noundef %b, <2 x i32> noundef %c) #0 {
-// CHECK: [[LANE:%.*]] = extractelement <2 x i32> %c, i32 1
-// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
-// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
-// CHECK: ret i64 [[VQDMLXL1]]
+// CHECK-LABEL: define dso_local i64 @test_vqdmlals_lane_s32(
+// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = extractelement <2 x i32> [[C]], i64 1
CHECK-NEXT: [[LANE:%.*]] = extractelement <2 x i32> [[C]], i64 1 +// CHECK-NEXT: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[LANE]]) +// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[A]], i64 [[VQDMLXL]]) +// CHECK-NEXT: ret i64 [[VQDMLXL1]] +// int64_t test_vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c) { return vqdmlals_lane_s32(a, b, c, 1); } -// CHECK-LABEL: define{{.*}} i32 @test_vqdmlalh_laneq_s16(i32 noundef %a, i16 noundef %b, <8 x i16> noundef %c) #0 { -// CHECK: [[LANE:%.*]] = extractelement <8 x i16> %c, i32 7 -// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[LANE]], i64 0 -// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) -// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0 -// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]]) -// CHECK: ret i32 [[VQDMLXL1]] +// CHECK-LABEL: define dso_local i32 @test_vqdmlalh_laneq_s16( +// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0 +// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A]], i32 [[LANE0]]) +// CHECK-NEXT: ret i32 [[VQDMLXL1]] +// int32_t test_vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c) { return vqdmlalh_laneq_s16(a, b, c, 7); } -// CHECK-LABEL: define{{.*}} i64 @test_vqdmlals_laneq_s32(i64 noundef %a, i32 noundef %b, <4 x i32> noundef %c) #0 { -// CHECK: [[LANE:%.*]] = extractelement <4 x i32> %c, i32 3 -// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]]) -// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]]) -// CHECK: ret i64 [[VQDMLXL1]] +// CHECK-LABEL: define dso_local i64 @test_vqdmlals_laneq_s32( +// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = extractelement <4 x i32> [[C]], i64 3 +// CHECK-NEXT: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[LANE]]) +// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[A]], i64 [[VQDMLXL]]) +// CHECK-NEXT: ret i64 [[VQDMLXL1]] +// int64_t test_vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c) { return vqdmlals_laneq_s32(a, b, c, 3); } -// CHECK-LABEL: define{{.*}} i32 @test_vqdmlslh_lane_s16(i32 noundef %a, i16 noundef %b, <4 x i16> noundef %c) #0 { -// CHECK: [[LANE:%.*]] = extractelement <4 x i16> %c, i32 3 -// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[LANE]], i64 0 -// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) -// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0 -// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]]) -// CHECK: ret i32 [[VQDMLXL1]] +// CHECK-LABEL: 
define dso_local i32 @test_vqdmlslh_lane_s16( +// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0 +// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A]], i32 [[LANE0]]) +// CHECK-NEXT: ret i32 [[VQDMLXL1]] +// int32_t test_vqdmlslh_lane_s16(int32_t a, int16_t b, int16x4_t c) { return vqdmlslh_lane_s16(a, b, c, 3); } -// CHECK-LABEL: define{{.*}} i64 @test_vqdmlsls_lane_s32(i64 noundef %a, i32 noundef %b, <2 x i32> noundef %c) #0 { -// CHECK: [[LANE:%.*]] = extractelement <2 x i32> %c, i32 1 -// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]]) -// CHECK: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]]) -// CHECK: ret i64 [[VQDMLXL1]] +// CHECK-LABEL: define dso_local i64 @test_vqdmlsls_lane_s32( +// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = extractelement <2 x i32> [[C]], i64 1 +// CHECK-NEXT: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[LANE]]) +// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[A]], i64 [[VQDMLXL]]) +// CHECK-NEXT: ret i64 [[VQDMLXL1]] +// int64_t test_vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c) { return vqdmlsls_lane_s32(a, b, c, 1); } -// CHECK-LABEL: define{{.*}} i32 @test_vqdmlslh_laneq_s16(i32 noundef %a, i16 noundef %b, <8 x i16> noundef %c) #0 { -// CHECK: [[LANE:%.*]] = extractelement <8 x i16> %c, i32 7 -// CHECK: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 %b, i64 0 -// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[LANE]], i64 0 -// CHECK: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) -// CHECK: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0 -// CHECK: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]]) -// CHECK: ret i32 [[VQDMLXL1]] +// CHECK-LABEL: define dso_local i32 @test_vqdmlslh_laneq_s16( +// CHECK-SAME: i32 noundef [[A:%.*]], i16 noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[C]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) +// CHECK-NEXT: [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0 +// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A]], i32 [[LANE0]]) +// CHECK-NEXT: ret i32 [[VQDMLXL1]] +// int32_t test_vqdmlslh_laneq_s16(int32_t a, int16_t b, int16x8_t c) { return vqdmlslh_laneq_s16(a, b, c, 7); } -// CHECK-LABEL: define{{.*}} i64 @test_vqdmlsls_laneq_s32(i64 noundef %a, i32 noundef %b, <4 x i32> noundef %c) #0 { -// CHECK: [[LANE:%.*]] = extractelement <4 x i32> %c, i32 3 -// CHECK: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]]) -// CHECK: 
[[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]]) -// CHECK: ret i64 [[VQDMLXL1]] +// CHECK-LABEL: define dso_local i64 @test_vqdmlsls_laneq_s32( +// CHECK-SAME: i64 noundef [[A:%.*]], i32 noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = extractelement <4 x i32> [[C]], i64 3 +// CHECK-NEXT: [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B]], i32 [[LANE]]) +// CHECK-NEXT: [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[A]], i64 [[VQDMLXL]]) +// CHECK-NEXT: ret i64 [[VQDMLXL1]] +// int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) { return vqdmlsls_laneq_s32(a, b, c, 3); } -// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_lane_f64_0() #0 { -// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double> -// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double> -// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0 -// CHECK: [[VGET_LANE7:%.*]] = extractelement <1 x double> [[TMP1]], i32 0 -// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE7]]) -// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0 -// CHECK: ret <1 x double> [[VSET_LANE]] +// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_lane_f64_0( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double 0x3FD6304BC43AB5C2, double 0x3FEE211E215AEEF3) +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> poison, double [[VMULXD_F64_I]], i64 0 +// CHECK-NEXT: ret <1 x double> [[VSET_LANE]] +// float64x1_t test_vmulx_lane_f64_0() { float64x1_t arg1; float64x1_t arg2; @@ -425,15 +513,13 @@ float64x1_t test_vmulx_lane_f64_0() { return result; } -// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_laneq_f64_2() #0 { -// CHECK: [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double> -// CHECK: [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double> -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> -// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0 -// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[SHUFFLE_I]], i32 1 -// CHECK: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) -// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0 -// CHECK: ret <1 x double> [[VSET_LANE]] +// CHECK-LABEL: define dso_local <1 x double> @test_vmulx_laneq_f64_2( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double 0x3FD6304BC43AB5C2, double 0x3FEE211E215AEEF3) +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x double> poison, double [[VMULXD_F64_I]], i64 0 +// CHECK-NEXT: ret <1 x double> [[VSET_LANE]] +// float64x1_t test_vmulx_laneq_f64_2() { float64x1_t arg1; float64x1_t arg2; diff --git a/clang/test/CodeGen/AArch64/poly-add.c b/clang/test/CodeGen/AArch64/poly-add.c index 0795aecac433f..d408e919639b2 100644 --- a/clang/test/CodeGen/AArch64/poly-add.c +++ b/clang/test/CodeGen/AArch64/poly-add.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature 
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine \
 // RUN: | FileCheck %s
 // REQUIRES: aarch64-registered-target
@@ -18,11 +18,8 @@ poly8x8_t test_vadd_p8(poly8x8_t a, poly8x8_t b) {
 // CHECK-LABEL: @test_vadd_p16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i16> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
 //
 poly16x4_t test_vadd_p16(poly16x4_t a, poly16x4_t b) {
   return vadd_p16 (a, b);
@@ -30,11 +27,8 @@ poly16x4_t test_vadd_p16(poly16x4_t a, poly16x4_t b) {
 // CHECK-LABEL: @test_vadd_p64(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <1 x i64> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
 //
 poly64x1_t test_vadd_p64(poly64x1_t a, poly64x1_t b) {
   return vadd_p64(a, b);
@@ -51,11 +45,8 @@ poly8x16_t test_vaddq_p8(poly8x16_t a, poly8x16_t b){
 // CHECK-LABEL: @test_vaddq_p16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i16> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
 //
 poly16x8_t test_vaddq_p16(poly16x8_t a, poly16x8_t b){
   return vaddq_p16(a, b);
@@ -63,11 +54,8 @@ poly16x8_t test_vaddq_p16(poly16x8_t a, poly16x8_t b){
 // CHECK-LABEL: @test_vaddq_p64(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i64> [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
 //
 poly64x2_t test_vaddq_p64(poly64x2_t a, poly64x2_t b){
   return vaddq_p64(a, b);
@@ -75,11 +63,8 @@ poly64x2_t test_vaddq_p64(poly64x2_t a, poly64x2_t b){
 // CHECK-LABEL: @test_vaddq_p128(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-// CHECK-NEXT: ret i128 [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor i128 [[A:%.*]], [[B:%.*]]
+// CHECK-NEXT: ret i128 [[TMP0]]
 //
 poly128_t test_vaddq_p128 (poly128_t a, poly128_t b){
   return vaddq_p128(a, b);
diff --git a/clang/test/CodeGen/AArch64/poly128.c b/clang/test/CodeGen/AArch64/poly128.c
index f188632468fc8..be683b17e0d0b 100644
--- a/clang/test/CodeGen/AArch64/poly128.c
+++ b/clang/test/CodeGen/AArch64/poly128.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN: -disable-O0-optnone -ffp-contract=fast -emit-llvm -o - %s | opt -S -passes=mem2reg \
+// RUN: -disable-O0-optnone -ffp-contract=fast -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine \
 // RUN: | FileCheck %s
 // REQUIRES: aarch64-registered-target
@@ -40,7 +40,7 @@ poly128_t test_vldrq_p128(poly128_t * ptr) {
 // CHECK-SAME: (ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[PTR]], align 16
-// CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i128, ptr [[PTR]], i64 1
+// CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 16
 // CHECK-NEXT: store i128 [[TMP0]], ptr [[ADD_PTR]], align 16
 // CHECK-NEXT: ret void
 //
@@ -61,13 +61,11 @@ __attribute__((target("aes"))) poly128_t test_vmull_p64(poly64_t a, poly64_t b)
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vmull_high_p64
-// CHECK-SAME: (<2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-SAME: (<2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR1]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I5:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[A]], <1 x i32>
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[SHUFFLE_I5]] to i64
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[B]], <2 x i64> [[B]], <1 x i32>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to i64
-// CHECK-NEXT: [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[TMP0]], i64 [[TMP1]])
+// CHECK-NEXT: [[SHUFFLE_I5_EXTRACT:%.*]] = extractelement <2 x i64> [[A]], i64 1
+// CHECK-NEXT: [[SHUFFLE_I_EXTRACT:%.*]] = extractelement <2 x i64> [[B]], i64 1
+// CHECK-NEXT: [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[SHUFFLE_I5_EXTRACT]], i64 [[SHUFFLE_I_EXTRACT]])
 // CHECK-NEXT: [[VMULL_P641_I_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I_I]] to i128
 // CHECK-NEXT: ret i128 [[VMULL_P641_I_I]]
 //
@@ -76,7 +74,7 @@ __attribute__((target("aes"))) poly128_t test_vmull_high_p64(poly64x2_t a, poly64x2_t b) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -86,7 +84,7 @@ poly128_t test_vreinterpretq_p128_s8(int8x16_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -96,7 +94,7 @@ poly128_t test_vreinterpretq_p128_s16(int16x8_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s32
-// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -106,7 +104,7 @@ poly128_t test_vreinterpretq_p128_s32(int32x4_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s64
-// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -116,7 +114,7 @@ poly128_t test_vreinterpretq_p128_s64(int64x2_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -126,7 +124,7 @@ poly128_t test_vreinterpretq_p128_u8(uint8x16_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -136,7 +134,7 @@ poly128_t test_vreinterpretq_p128_u16(uint16x8_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u32
-// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -146,7 +144,7 @@ poly128_t test_vreinterpretq_p128_u32(uint32x4_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u64
-// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -156,7 +154,7 @@ poly128_t test_vreinterpretq_p128_u64(uint64x2_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -166,7 +164,7 @@ poly128_t test_vreinterpretq_p128_f32(float32x4_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_f64
-// CHECK-SAME: (<2 x double> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -176,7 +174,7 @@ poly128_t test_vreinterpretq_p128_f64(float64x2_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_p8
-// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -186,7 +184,7 @@ poly128_t test_vreinterpretq_p128_p8(poly8x16_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_p16
-// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -196,7 +194,7 @@ poly128_t test_vreinterpretq_p128_p16(poly16x8_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_p64
-// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to i128
 // CHECK-NEXT: ret i128 [[TMP0]]
@@ -206,7 +204,7 @@ poly128_t test_vreinterpretq_p128_p64(poly64x2_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s8_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <16 x i8>
 // CHECK-NEXT: ret <16 x i8> [[TMP0]]
@@ -216,7 +214,7 @@ int8x16_t test_vreinterpretq_s8_p128(poly128_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s16_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x i16>
 // CHECK-NEXT: ret <8 x i16> [[TMP0]]
@@ -226,7 +224,7 @@ int16x8_t test_vreinterpretq_s16_p128(poly128_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s32_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <4 x i32>
 // CHECK-NEXT: ret <4 x i32> [[TMP0]]
@@ -236,7 +234,7 @@ int32x4_t test_vreinterpretq_s32_p128(poly128_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s64_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x i64>
 // CHECK-NEXT: ret <2 x i64> [[TMP0]]
@@ -246,7 +244,7 @@ int64x2_t test_vreinterpretq_s64_p128(poly128_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u8_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <16 x i8>
 // CHECK-NEXT: ret <16 x i8> [[TMP0]]
@@ -256,7 +254,7 @@ uint8x16_t test_vreinterpretq_u8_p128(poly128_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u16_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x i16>
 // CHECK-NEXT: ret <8 x i16> [[TMP0]]
@@ -266,7 +264,7 @@ uint16x8_t test_vreinterpretq_u16_p128(poly128_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u32_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <4 x i32>
 // CHECK-NEXT: ret <4 x i32> [[TMP0]]
@@ -276,7 +274,7 @@ uint32x4_t test_vreinterpretq_u32_p128(poly128_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u64_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x i64>
 // CHECK-NEXT: ret <2 x i64> [[TMP0]]
@@ -286,7 +284,7 @@ uint64x2_t test_vreinterpretq_u64_p128(poly128_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_f32_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <4 x float>
 // CHECK-NEXT: ret <4 x float> [[TMP0]]
@@ -296,7 +294,7 @@ float32x4_t test_vreinterpretq_f32_p128(poly128_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_f64_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x double>
 // CHECK-NEXT: ret <2 x double> [[TMP0]]
@@ -306,7 +304,7 @@ float64x2_t test_vreinterpretq_f64_p128(poly128_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p8_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <16 x i8>
 // CHECK-NEXT: ret <16 x i8> [[TMP0]]
@@ -316,7 +314,7 @@ poly8x16_t test_vreinterpretq_p8_p128(poly128_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p16_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x i16>
 // CHECK-NEXT: ret <8 x i16> [[TMP0]]
@@ -326,7 +324,7 @@ poly16x8_t test_vreinterpretq_p16_p128(poly128_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p64_p128
-// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR3]] {
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
 // CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x i64>
 // CHECK-NEXT: ret <2 x i64> [[TMP0]]
diff --git a/clang/test/CodeGen/AArch64/poly64.c b/clang/test/CodeGen/AArch64/poly64.c
index f3c057ecf48c1..558833469d386 100644
--- a/clang/test/CodeGen/AArch64/poly64.c
+++ b/clang/test/CodeGen/AArch64/poly64.c
@@ -1,537 +1,717 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN: -ffp-contract=fast -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg \
+// RUN: -ffp-contract=fast -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine \
 // RUN: | FileCheck %s
 // REQUIRES: aarch64-registered-target || arm-registered-target
 #include <arm_neon.h>
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vceq_p64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vceq_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[SEXT_I]]
+//
 uint64x1_t test_vceq_p64(poly64x1_t a, poly64x1_t b) {
   return vceq_p64(a, b);
 }
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vceqq_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i64> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[SEXT_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vceqq_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[SEXT_I]]
+//
 uint64x2_t test_vceqq_p64(poly64x2_t a, poly64x2_t b) {
   return vceqq_p64(a, b);
 }
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vtst_p64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[TMP4:%.*]] = and <1 x i64> %a, %b
-// CHECK: [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64>
-// CHECK: ret <1 x i64> [[VTST_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vtst_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <1 x i64> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <1 x i1> [[TMP1]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[VTST_I]]
+//
 uint64x1_t test_vtst_p64(poly64x1_t a, poly64x1_t b) {
   return vtst_p64(a, b);
 }
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vtstq_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[TMP4:%.*]] = and <2 x i64> %a, %b
-// CHECK: [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer
-// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VTST_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vtstq_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = and <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i64> [[TMP0]], zeroinitializer
+// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VTST_I]]
+//
 uint64x2_t test_vtstq_p64(poly64x2_t a, poly64x2_t b) {
   return vtstq_p64(a, b);
 }
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vbsl_p64(<1 x i64> noundef %a, <1 x i64> noundef %b, <1 x i64> noundef %c) #0 {
-// CHECK: [[VBSL3_I:%.*]] = and <1 x i64> %a, %b
-// CHECK: [[TMP3:%.*]] = xor <1 x i64> %a, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], %c
-// CHECK: [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <1 x i64> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vbsl_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]], <1 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <1 x i64> [[A]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <1 x i64> [[C]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <1 x i64> [[VBSL5_I]]
+//
 poly64x1_t test_vbsl_p64(poly64x1_t a, poly64x1_t b, poly64x1_t c) {
   return vbsl_p64(a, b, c);
 }
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vbslq_p64(<2 x i64> noundef %a, <2 x i64> noundef %b, <2 x i64> noundef %c) #0 {
-// CHECK: [[VBSL3_I:%.*]] = and <2 x i64> %a, %b
-// CHECK: [[TMP3:%.*]] = xor <2 x i64> %a, splat (i64 -1)
-// CHECK: [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], %c
-// CHECK: [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
-// CHECK: ret <2 x i64> [[VBSL5_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vbslq_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]], <2 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL3_I:%.*]] = and <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i64> [[A]], splat (i64 -1)
+// CHECK-NEXT: [[VBSL4_I:%.*]] = and <2 x i64> [[C]], [[TMP0]]
+// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT: ret <2 x i64> [[VBSL5_I]]
+//
 poly64x2_t test_vbslq_p64(poly64x2_t a, poly64x2_t b, poly64x2_t c) {
   return vbslq_p64(a, b, c);
 }
-// CHECK-LABEL: define{{.*}} i64 @test_vget_lane_p64(<1 x i64> noundef %v) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %v, i32 0
-// CHECK: ret i64 [[VGET_LANE]]
+// CHECK-LABEL: define dso_local i64 @test_vget_lane_p64(
+// CHECK-SAME: <1 x i64> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[V]], i64 0
+// CHECK-NEXT: ret i64 [[VGET_LANE]]
+//
 poly64_t test_vget_lane_p64(poly64x1_t v) {
   return vget_lane_p64(v, 0);
 }
-// CHECK-LABEL: define{{.*}} i64 @test_vgetq_lane_p64(<2 x i64> noundef %v) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> %v, i32 1
-// CHECK: ret i64 [[VGETQ_LANE]]
+// CHECK-LABEL: define dso_local i64 @test_vgetq_lane_p64(
+// CHECK-SAME: <2 x i64> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[V]], i64 1
+// CHECK-NEXT: ret i64 [[VGETQ_LANE]]
+//
 poly64_t test_vgetq_lane_p64(poly64x2_t v) {
   return vgetq_lane_p64(v, 1);
 }
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vset_lane_p64(i64 noundef %a, <1 x i64> noundef %v) #0 {
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> %v, i64 %a, i32 0
-// CHECK: ret <1 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vset_lane_p64(
+// CHECK-SAME: i64 noundef [[A:%.*]], <1 x i64> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[VSET_LANE]]
+//
 poly64x1_t test_vset_lane_p64(poly64_t a, poly64x1_t v) {
   return vset_lane_p64(a, v, 0);
 }
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vsetq_lane_p64(i64 noundef %a, <2 x i64> noundef %v) #0 {
-// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %v, i64 %a, i32 1
-// CHECK: ret <2 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vsetq_lane_p64(
+// CHECK-SAME: i64 noundef [[A:%.*]], <2 x i64> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[V]], i64 [[A]], i64 1
+// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]]
+//
 poly64x2_t test_vsetq_lane_p64(poly64_t a, poly64x2_t v) {
   return vsetq_lane_p64(a, v, 1);
 }
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vcopy_lane_p64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %b, i32 0
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> %a, i64 [[VGET_LANE]], i32 0
-// CHECK: ret <1 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcopy_lane_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[B]]
+//
 poly64x1_t test_vcopy_lane_p64(poly64x1_t a, poly64x1_t b) {
   return vcopy_lane_p64(a, 0, b, 0);
 }
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vcopyq_lane_p64(<2 x i64> noundef %a, <1 x i64> noundef %b) #0 {
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %b, i32 0
-// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %a, i64 [[VGET_LANE]], i32 1
-// CHECK: ret <2 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcopyq_lane_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <1 x i64> [[B]], <1 x i64> poison, <2 x i32>
+// CHECK-NEXT: [[VSET_LANE:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[TMP0]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]]
+//
 poly64x2_t test_vcopyq_lane_p64(poly64x2_t a, poly64x1_t b) {
   return vcopyq_lane_p64(a, 1, b, 0);
 }
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vcopyq_laneq_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 {
-// CHECK: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> %b, i32 1
-// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %a, i64 [[VGETQ_LANE]], i32 1
-// CHECK: ret <2 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcopyq_laneq_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]]
+//
 poly64x2_t test_vcopyq_laneq_p64(poly64x2_t a, poly64x2_t b) {
   return vcopyq_laneq_p64(a, 1, b, 1);
 }
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vcreate_p64(i64 noundef %a) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vcreate_p64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 poly64x1_t test_vcreate_p64(uint64_t a) {
   return vcreate_p64(a);
 }
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vdup_n_p64(i64 noundef %a) #0 {
-// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0
-// CHECK: ret <1 x i64> [[VECINIT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vdup_n_p64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[VECINIT_I]]
+//
 poly64x1_t test_vdup_n_p64(poly64_t a) {
   return vdup_n_p64(a);
 }
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vdupq_n_p64(i64 noundef %a) #0 {
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
-// CHECK: ret <2 x i64> [[VECINIT1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vdupq_n_p64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i64> [[VECINIT_I]], <2 x i64> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[VECINIT1_I]]
+//
 poly64x2_t test_vdupq_n_p64(poly64_t a) {
   return vdupq_n_p64(a);
 }
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vmov_n_p64(i64 noundef %a) #0 {
-// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0
-// CHECK: ret <1 x i64> [[VECINIT_I]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vmov_n_p64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[VECINIT_I]]
+//
 poly64x1_t test_vmov_n_p64(poly64_t a) {
   return vmov_n_p64(a);
 }
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vmovq_n_p64(i64 noundef %a) #0 {
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
-// CHECK: ret <2 x i64> [[VECINIT1_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vmovq_n_p64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i64> [[VECINIT_I]], <2 x i64> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[VECINIT1_I]]
+//
 poly64x2_t test_vmovq_n_p64(poly64_t a) {
   return vmovq_n_p64(a);
 }
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vdup_lane_p64(<1 x i64> noundef %vec) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[VEC:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[LANE]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vdup_lane_p64(
+// CHECK-SAME: <1 x i64> noundef [[VEC:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[VEC]]
+//
 poly64x1_t test_vdup_lane_p64(poly64x1_t vec) {
   return vdup_lane_p64(vec, 0);
 }
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vdupq_lane_p64(<1 x i64> noundef %vec) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[VEC:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK: ret <2 x i64> [[LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vdupq_lane_p64(
+// CHECK-SAME: <1 x i64> noundef [[VEC:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[VEC]], <1 x i64> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[LANE]]
+//
 poly64x2_t test_vdupq_lane_p64(poly64x1_t vec) {
   return vdupq_lane_p64(vec, 0);
 }
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vdupq_laneq_p64(<2 x i64> noundef %vec) #0 {
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> [[VEC:%.*]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32>
-// CHECK: ret <2 x i64> [[LANE]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vdupq_laneq_p64(
+// CHECK-SAME: <2 x i64> noundef [[VEC:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[VEC]], <2 x i64> poison, <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[LANE]]
+//
 poly64x2_t test_vdupq_laneq_p64(poly64x2_t vec) {
   return vdupq_laneq_p64(vec, 1);
 }
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vcombine_p64(<1 x i64> noundef %low, <1 x i64> noundef %high) #0 {
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32>
-// CHECK: ret <2 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vcombine_p64(
+// CHECK-SAME: <1 x i64> noundef [[LOW:%.*]], <1 x i64> noundef [[HIGH:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> [[LOW]], <1 x i64> [[HIGH]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]]
+//
 poly64x2_t test_vcombine_p64(poly64x1_t low, poly64x1_t high) {
   return vcombine_p64(low, high);
 }
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_p64(ptr noundef %ptr) #0 {
-// CHECK: [[TMP2:%.*]] = load <1 x i64>, ptr %ptr
-// CHECK: ret <1 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <1 x i64> @test_vld1_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[PTR]], align 8
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 poly64x1_t test_vld1_p64(poly64_t const * ptr) {
   return vld1_p64(ptr);
 }
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_p64(ptr noundef %ptr) #0 {
-// CHECK: [[TMP2:%.*]] = load <2 x i64>, ptr %ptr
-// CHECK: ret <2 x i64> [[TMP2]]
+// CHECK-LABEL: define dso_local <2 x i64> @test_vld1q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[PTR]], align 8
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 poly64x2_t test_vld1q_p64(poly64_t const * ptr) {
   return vld1q_p64(ptr);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst1_p64(ptr noundef %ptr, <1 x i64> noundef %val) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %val to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: store <1 x i64> [[TMP3]], ptr %ptr
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], <1 x i64> noundef [[VAL:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <1 x i64> [[VAL]], ptr [[PTR]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1_p64(poly64_t * ptr, poly64x1_t val) {
   return vst1_p64(ptr, val);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst1q_p64(ptr noundef %ptr, <2 x i64> noundef %val) #0 {
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %val to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: store <2 x i64> [[TMP3]], ptr %ptr
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst1q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], <2 x i64> noundef [[VAL:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: store <2 x i64> [[VAL]], ptr [[PTR]], align 8
+// CHECK-NEXT: ret void
+//
 void test_vst1q_p64(poly64_t * ptr, poly64x2_t val) {
   return vst1q_p64(ptr, val);
 }
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_p64(ptr noundef %ptr) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr %ptr)
-// CHECK: store { <1 x i64>, <1 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 16, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x2_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x2_t @test_vld2_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT: [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0(ptr [[PTR]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD2_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(16) [[__RET]], i64 16, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT3]], align 8
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY64X1X2_T]] poison, [2 x <1 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X2_T]] [[TMP1]]
+//
 poly64x1x2_t test_vld2_p64(poly64_t const * ptr) {
   return vld2_p64(ptr);
 }
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_p64(ptr noundef %ptr) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr %ptr)
-// CHECK: store { <2 x i64>, <2 x i64> } [[VLD2]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x2_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x2_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x2_t @test_vld2q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT: [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
+// CHECK-NEXT: [[VLD2_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD2_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[VLD2]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD2_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT3]], align 16
+// CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [2 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK4]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_POLY64X2X2_T]] poison, [2 x <2 x i64>] [[DOTUNPACK5]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X2_T]] [[TMP1]]
+//
 poly64x2x2_t test_vld2q_p64(poly64_t const * ptr) {
   return vld2q_p64(ptr);
 }
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_p64(ptr noundef %ptr) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr %ptr)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 24, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x3_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x3_t @test_vld3_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT: [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0(ptr [[PTR]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(24) [[__RET]], i64 24, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT5]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY64X1X3_T]] poison, [3 x <1 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X3_T]] [[TMP2]]
+//
 poly64x1x3_t test_vld3_p64(poly64_t const * ptr) {
   return vld3_p64(ptr);
 }
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_p64(ptr noundef %ptr) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr %ptr)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 48, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x3_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x3_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x3_t @test_vld3q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT: [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0(ptr [[PTR]])
+// CHECK-NEXT: [[VLD3_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD3_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD3_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD3_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(48) [[__RET]], i64 48, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [3 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT5:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK6:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT5]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [3 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK6]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[DOTUNPACK9:%.*]] = insertvalue [3 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK8]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_POLY64X2X3_T]] poison, [3 x <2 x i64>] [[DOTUNPACK9]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X3_T]] [[TMP2]]
+//
 poly64x2x3_t test_vld3q_p64(poly64_t const * ptr) {
   return vld3q_p64(ptr);
 }
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_p64(ptr noundef %ptr) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr %ptr)
-// CHECK: store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[__RET]], i64 32, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x1x4_t, ptr [[RETVAL]], align 8
-// CHECK: ret %struct.poly64x1x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x1x4_t @test_vld4_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT: [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0(ptr [[PTR]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT]], ptr [[__RET]], align 8
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 8
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 8
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 8
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 24
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <1 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[RETVAL]], ptr noundef nonnull align 8 dereferenceable(32) [[__RET]], i64 32, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <1 x i64>, ptr [[RETVAL]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <1 x i64>] poison, <1 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT7]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <1 x i64>] [[TMP0]], <1 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT9]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <1 x i64>] [[TMP1]], <1 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 24
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <1 x i64>, ptr [[DOTUNPACK_ELT11]], align 8
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <1 x i64>] [[TMP2]], <1 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY64X1X4_T]] poison, [4 x <1 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X1X4_T]] [[TMP3]]
+//
 poly64x1x4_t test_vld4_p64(poly64_t const * ptr) {
   return vld4_p64(ptr);
 }
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t @test_vld4q_p64(ptr noundef %ptr) #0 {
-// CHECK: [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr %ptr)
-// CHECK: store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], ptr [[__RET]]
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[RETVAL]], ptr align 16 [[__RET]], i64 64, i1 false)
-// CHECK: [[TMP6:%.*]] = load %struct.poly64x2x4_t, ptr [[RETVAL]], align 16
-// CHECK: ret %struct.poly64x2x4_t [[TMP6]]
+// CHECK-LABEL: define dso_local %struct.poly64x2x4_t @test_vld4q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16
+// CHECK-NEXT: [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT: [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0(ptr [[PTR]])
+// CHECK-NEXT: [[VLD4_ELT:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 0
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT]], ptr [[__RET]], align 16
+// CHECK-NEXT: [[__RET_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 16
+// CHECK-NEXT: [[VLD4_ELT2:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 1
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT2]], ptr [[__RET_REPACK1]], align 16
+// CHECK-NEXT: [[__RET_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 32
+// CHECK-NEXT: [[VLD4_ELT4:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 2
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT4]], ptr [[__RET_REPACK3]], align 16
+// CHECK-NEXT: [[__RET_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[__RET]], i64 48
+// CHECK-NEXT: [[VLD4_ELT6:%.*]] = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], 3
+// CHECK-NEXT: store <2 x i64> [[VLD4_ELT6]], ptr [[__RET_REPACK5]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[RETVAL]], ptr noundef nonnull align 16 dereferenceable(64) [[__RET]], i64 64, i1 false)
+// CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [4 x <2 x i64>] poison, <2 x i64> [[DOTUNPACK_UNPACK]], 0
+// CHECK-NEXT: [[DOTUNPACK_ELT7:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 16
+// CHECK-NEXT: [[DOTUNPACK_UNPACK8:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT7]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [4 x <2 x i64>] [[TMP0]], <2 x i64> [[DOTUNPACK_UNPACK8]], 1
+// CHECK-NEXT: [[DOTUNPACK_ELT9:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 32
+// CHECK-NEXT: [[DOTUNPACK_UNPACK10:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT9]], align 16
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [4 x <2 x i64>] [[TMP1]], <2 x i64> [[DOTUNPACK_UNPACK10]], 2
+// CHECK-NEXT: [[DOTUNPACK_ELT11:%.*]] = getelementptr inbounds nuw i8, ptr [[RETVAL]], i64 48
+// CHECK-NEXT: [[DOTUNPACK_UNPACK12:%.*]] = load <2 x i64>, ptr [[DOTUNPACK_ELT11]], align 16
+// CHECK-NEXT: [[DOTUNPACK13:%.*]] = insertvalue [4 x <2 x i64>] [[TMP2]], <2 x i64> [[DOTUNPACK_UNPACK12]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[STRUCT_POLY64X2X4_T]] poison, [4 x <2 x i64>] [[DOTUNPACK13]], 0
+// CHECK-NEXT: ret [[STRUCT_POLY64X2X4_T]] [[TMP3]]
+//
 poly64x2x4_t test_vld4q_p64(poly64_t const * ptr) {
   return vld4q_p64(ptr);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst2_p64(ptr noundef %ptr, [2 x <1 x i64>] alignstack(8) %val.coerce) #0 {
-// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[VAL]], i32 0, i32 0
-// CHECK: store [2 x <1 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[VAL]], i64 16, i1 false)
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL2]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX3]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], ptr %ptr)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <1 x i64>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VAL:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT: [[VAL_COERCE_ELT:%.*]] = extractvalue [2 x <1 x i64>] [[VAL_COERCE]], 0
+// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT]], ptr [[VAL]], align 8
+// CHECK-NEXT: [[VAL_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 8
+// CHECK-NEXT: [[VAL_COERCE_ELT2:%.*]] = extractvalue [2 x <1 x i64>] [[VAL_COERCE]], 1
+// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT2]], ptr [[VAL_REPACK1]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(16) [[__S1]], ptr noundef nonnull align 8 dereferenceable(16) [[VAL]], i64 16, i1 false)
+// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8
+// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX3]], align 8
+// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], ptr [[PTR]])
+// CHECK-NEXT: ret void
+//
 void test_vst2_p64(poly64_t * ptr, poly64x1x2_t val) {
   return vst2_p64(ptr, val);
 }
-// CHECK-LABEL: define{{.*}} void @test_vst2q_p64(ptr noundef %ptr, [2 x <2 x i64>] alignstack(16) %val.coerce) #0 {
-// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[VAL]], i32 0, i32 0
-// CHECK: store [2 x <2 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[VAL]], i64 32, i1 false)
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL1]], i64 0, i64 0
-// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <2 x i64>], ptr [[VAL2]], i64 0, i64 1
-// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX3]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], ptr %ptr)
-// CHECK: ret void
+// CHECK-LABEL: define dso_local void @test_vst2q_p64(
+// CHECK-SAME: ptr noundef [[PTR:%.*]], [2 x <2 x i64>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VAL:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
+// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT: [[VAL_COERCE_ELT:%.*]] = extractvalue [2 x <2 x i64>] [[VAL_COERCE]], 0
+// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT]], ptr [[VAL]], align 16
+// CHECK-NEXT: [[VAL_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 16
+// CHECK-NEXT: [[VAL_COERCE_ELT2:%.*]] = extractvalue [2 x <2 x i64>] [[VAL_COERCE]], 1
+// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT2]], ptr [[VAL_REPACK1]], align 16
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(32) [[__S1]], ptr noundef nonnull align 16
dereferenceable(32) [[VAL]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX3]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// void test_vst2q_p64(poly64_t * ptr, poly64x2x2_t val) { return vst2q_p64(ptr, val); } -// CHECK-LABEL: define{{.*}} void @test_vst3_p64(ptr noundef %ptr, [3 x <1 x i64>] alignstack(8) %val.coerce) #0 { -// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[VAL]], i32 0, i32 0 -// CHECK: store [3 x <1 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[VAL]], i64 24, i1 false) -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL2]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX3]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[VAL4:%.*]] = getelementptr inbounds nuw %struct.poly64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL4]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX5]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], ptr %ptr) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3_p64( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <1 x i64>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8 +// CHECK-NEXT: [[VAL_COERCE_ELT:%.*]] = extractvalue [3 x <1 x i64>] [[VAL_COERCE]], 0 +// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT]], ptr [[VAL]], align 8 +// CHECK-NEXT: [[VAL_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 8 +// CHECK-NEXT: [[VAL_COERCE_ELT2:%.*]] = extractvalue [3 x <1 x i64>] [[VAL_COERCE]], 1 +// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT2]], ptr [[VAL_REPACK1]], align 8 +// CHECK-NEXT: [[VAL_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 16 +// CHECK-NEXT: [[VAL_COERCE_ELT4:%.*]] = extractvalue [3 x <1 x i64>] [[VAL_COERCE]], 2 +// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT4]], ptr [[VAL_REPACK3]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(24) [[__S1]], ptr noundef nonnull align 8 dereferenceable(24) 
[[VAL]], i64 24, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX3]], align 8 +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX5]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// void test_vst3_p64(poly64_t * ptr, poly64x1x3_t val) { return vst3_p64(ptr, val); } -// CHECK-LABEL: define{{.*}} void @test_vst3q_p64(ptr noundef %ptr, [3 x <2 x i64>] alignstack(16) %val.coerce) #0 { -// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[VAL]], i32 0, i32 0 -// CHECK: store [3 x <2 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[VAL]], i64 48, i1 false) -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL1]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL2]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX3]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[VAL4:%.*]] = getelementptr inbounds nuw %struct.poly64x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <2 x i64>], ptr [[VAL4]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX5]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], ptr %ptr) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst3q_p64( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [3 x <2 x i64>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16 +// CHECK-NEXT: [[VAL_COERCE_ELT:%.*]] = extractvalue [3 x <2 x i64>] [[VAL_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT]], ptr [[VAL]], align 16 +// CHECK-NEXT: [[VAL_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 16 +// CHECK-NEXT: [[VAL_COERCE_ELT2:%.*]] = extractvalue [3 x <2 x i64>] [[VAL_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT2]], ptr [[VAL_REPACK1]], align 16 +// CHECK-NEXT: [[VAL_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 32 +// CHECK-NEXT: [[VAL_COERCE_ELT4:%.*]] = extractvalue [3 x <2 x i64>] [[VAL_COERCE]], 2 +// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT4]], ptr 
[[VAL_REPACK3]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) [[__S1]], ptr noundef nonnull align 16 dereferenceable(48) [[VAL]], i64 48, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX3]], align 16 +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX5]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st3.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// void test_vst3q_p64(poly64_t * ptr, poly64x2x3_t val) { return vst3q_p64(ptr, val); } -// CHECK-LABEL: define{{.*}} void @test_vst4_p64(ptr noundef %ptr, [4 x <1 x i64>] alignstack(8) %val.coerce) #0 { -// CHECK: [[VAL:%.*]] = alloca %struct.poly64x1x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[VAL]], i32 0, i32 0 -// CHECK: store [4 x <1 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[__S1]], ptr align 8 [[VAL]], i64 32, i1 false) -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8> -// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL2]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <1 x i64>, ptr [[ARRAYIDX3]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8> -// CHECK: [[VAL4:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL4]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <1 x i64>, ptr [[ARRAYIDX5]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8> -// CHECK: [[VAL6:%.*]] = getelementptr inbounds nuw %struct.poly64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL6]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <1 x i64>, ptr [[ARRAYIDX7]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64> -// CHECK: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], ptr %ptr) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4_p64( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <1 x i64>] alignstack(8) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8 +// CHECK-NEXT: [[VAL_COERCE_ELT:%.*]] = extractvalue [4 x <1 x i64>] 
[[VAL_COERCE]], 0 +// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT]], ptr [[VAL]], align 8 +// CHECK-NEXT: [[VAL_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 8 +// CHECK-NEXT: [[VAL_COERCE_ELT2:%.*]] = extractvalue [4 x <1 x i64>] [[VAL_COERCE]], 1 +// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT2]], ptr [[VAL_REPACK1]], align 8 +// CHECK-NEXT: [[VAL_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 16 +// CHECK-NEXT: [[VAL_COERCE_ELT4:%.*]] = extractvalue [4 x <1 x i64>] [[VAL_COERCE]], 2 +// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT4]], ptr [[VAL_REPACK3]], align 8 +// CHECK-NEXT: [[VAL_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 24 +// CHECK-NEXT: [[VAL_COERCE_ELT6:%.*]] = extractvalue [4 x <1 x i64>] [[VAL_COERCE]], 3 +// CHECK-NEXT: store <1 x i64> [[VAL_COERCE_ELT6]], ptr [[VAL_REPACK5]], align 8 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(32) [[__S1]], ptr noundef nonnull align 8 dereferenceable(32) [[VAL]], i64 32, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[ARRAYIDX3]], align 8 +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[ARRAYIDX5]], align 8 +// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 24 +// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[ARRAYIDX7]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v1i64.p0(<1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// void test_vst4_p64(poly64_t * ptr, poly64x1x4_t val) { return vst4_p64(ptr, val); } -// CHECK-LABEL: define{{.*}} void @test_vst4q_p64(ptr noundef %ptr, [4 x <2 x i64>] alignstack(16) %val.coerce) #0 { -// CHECK: [[VAL:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[VAL]], i32 0, i32 0 -// CHECK: store [4 x <2 x i64>] [[VAL]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[__S1]], ptr align 16 [[VAL]], i64 64, i1 false) -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL1]], i64 0, i64 0 -// CHECK: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8> -// CHECK: [[VAL2:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL2]], i64 0, i64 1 -// CHECK: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX3]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8> -// CHECK: [[VAL4:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <2 x i64>], ptr [[VAL4]], i64 0, i64 2 -// CHECK: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARRAYIDX5]], align 16 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8> -// CHECK: [[VAL6:%.*]] = getelementptr inbounds nuw %struct.poly64x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX7:%.*]] = 
getelementptr inbounds [4 x <2 x i64>], ptr [[VAL6]], i64 0, i64 3 -// CHECK: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARRAYIDX7]], align 16 -// CHECK: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64> -// CHECK: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], ptr %ptr) -// CHECK: ret void +// CHECK-LABEL: define dso_local void @test_vst4q_p64( +// CHECK-SAME: ptr noundef [[PTR:%.*]], [4 x <2 x i64>] alignstack(16) [[VAL_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VAL:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16 +// CHECK-NEXT: [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16 +// CHECK-NEXT: [[VAL_COERCE_ELT:%.*]] = extractvalue [4 x <2 x i64>] [[VAL_COERCE]], 0 +// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT]], ptr [[VAL]], align 16 +// CHECK-NEXT: [[VAL_REPACK1:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 16 +// CHECK-NEXT: [[VAL_COERCE_ELT2:%.*]] = extractvalue [4 x <2 x i64>] [[VAL_COERCE]], 1 +// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT2]], ptr [[VAL_REPACK1]], align 16 +// CHECK-NEXT: [[VAL_REPACK3:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 32 +// CHECK-NEXT: [[VAL_COERCE_ELT4:%.*]] = extractvalue [4 x <2 x i64>] [[VAL_COERCE]], 2 +// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT4]], ptr [[VAL_REPACK3]], align 16 +// CHECK-NEXT: [[VAL_REPACK5:%.*]] = getelementptr inbounds nuw i8, ptr [[VAL]], i64 48 +// CHECK-NEXT: [[VAL_COERCE_ELT6:%.*]] = extractvalue [4 x <2 x i64>] [[VAL_COERCE]], 3 +// CHECK-NEXT: store <2 x i64> [[VAL_COERCE_ELT6]], ptr [[VAL_REPACK5]], align 16 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(64) [[__S1]], ptr noundef nonnull align 16 dereferenceable(64) [[VAL]], i64 64, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX3]], align 16 +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARRAYIDX5]], align 16 +// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds nuw i8, ptr [[__S1]], i64 48 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARRAYIDX7]], align 16 +// CHECK-NEXT: call void @llvm.aarch64.neon.st4.v2i64.p0(<2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], ptr [[PTR]]) +// CHECK-NEXT: ret void +// void test_vst4q_p64(poly64_t * ptr, poly64x2x4_t val) { return vst4q_p64(ptr, val); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vext_p64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[VEXT]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vext_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef 
[[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <1 x i64> [[A]] +// poly64x1_t test_vext_p64(poly64x1_t a, poly64x1_t b) { return vext_u64(a, b, 0); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vextq_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> -// CHECK: ret <2 x i64> [[VEXT]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vextq_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[VEXT]] +// poly64x2_t test_vextq_p64(poly64x2_t a, poly64x2_t b) { return vextq_p64(a, b, 1); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vzip1q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 { -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vzip1q_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// poly64x2_t test_vzip1q_p64(poly64x2_t a, poly64x2_t b) { return vzip1q_p64(a, b); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vzip2q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 { -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vzip2q_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// poly64x2_t test_vzip2q_p64(poly64x2_t a, poly64x2_t b) { return vzip2q_u64(a, b); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vuzp1q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 { -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp1q_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// poly64x2_t test_vuzp1q_p64(poly64x2_t a, poly64x2_t b) { return vuzp1q_p64(a, b); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vuzp2q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 { -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vuzp2q_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// poly64x2_t test_vuzp2q_p64(poly64x2_t a, poly64x2_t 
b) { return vuzp2q_u64(a, b); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vtrn1q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 { -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn1q_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// poly64x2_t test_vtrn1q_p64(poly64x2_t a, poly64x2_t b) { return vtrn1q_p64(a, b); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vtrn2q_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 { -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn2q_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// poly64x2_t test_vtrn2q_p64(poly64x2_t a, poly64x2_t b) { return vtrn2q_u64(a, b); } -// CHECK-LABEL: define{{.*}} <1 x i64> @test_vsri_n_p64(<1 x i64> noundef %a, <1 x i64> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 33) -// CHECK: ret <1 x i64> [[VSRI_N2]] +// CHECK-LABEL: define dso_local <1 x i64> @test_vsri_n_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], i32 33) +// CHECK-NEXT: ret <1 x i64> [[VSRI_N2]] +// poly64x1_t test_vsri_n_p64(poly64x1_t a, poly64x1_t b) { return vsri_n_p64(a, b, 33); } -// CHECK-LABEL: define{{.*}} <2 x i64> @test_vsriq_n_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 64) -// CHECK: ret <2 x i64> [[VSRI_N2]] +// CHECK-LABEL: define dso_local <2 x i64> @test_vsriq_n_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], i32 64) +// CHECK-NEXT: ret <2 x i64> [[VSRI_N2]] +// poly64x2_t test_vsriq_n_p64(poly64x2_t a, poly64x2_t b) { return vsriq_n_p64(a, b, 64); } diff --git a/clang/test/CodeGen/AArch64/v8.1a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.1a-neon-intrinsics.c index bc985efa6bc99..d98a7da3e3587 100644 --- a/clang/test/CodeGen/AArch64/v8.1a-neon-intrinsics.c +++ b/clang/test/CodeGen/AArch64/v8.1a-neon-intrinsics.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated 
by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \ -// RUN: -target-feature +v8.1a -emit-llvm -disable-O0-optnone -o - %s | opt -passes=mem2reg,dce -S | FileCheck %s +// RUN: -target-feature +v8.1a -emit-llvm -disable-O0-optnone -o - %s | opt -passes=mem2reg,instcombine,dce -S | FileCheck %s // REQUIRES: aarch64-registered-target @@ -8,11 +8,9 @@ // CHECK-LABEL: @test_vqrdmlah_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) -// CHECK-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VQRDMLAH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) +// CHECK-NEXT: ret <4 x i16> [[VQRDMLAH_S163_I]] // int16x4_t test_vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { return vqrdmlah_laneq_s16(a, b, v, 7); @@ -20,11 +18,9 @@ int16x4_t test_vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vqrdmlah_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> -// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) -// CHECK-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VQRDMLAH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) +// CHECK-NEXT: ret <2 x i32> [[VQRDMLAH_S323_I]] // int32x2_t test_vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { return vqrdmlah_laneq_s32(a, b, v, 3); @@ -32,11 +28,9 @@ int32x2_t test_vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vqrdmlahq_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> -// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) -// CHECK-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> +// CHECK-NEXT: [[VQRDMLAHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) +// CHECK-NEXT: ret <8 x i16> [[VQRDMLAHQ_S163_I]] // int16x8_t test_vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) { return vqrdmlahq_laneq_s16(a, b, v, 7); @@ -44,11 +38,9 @@ int16x8_t test_vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vqrdmlahq_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> 
[[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) -// CHECK-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> +// CHECK-NEXT: [[VQRDMLAHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) +// CHECK-NEXT: ret <4 x i32> [[VQRDMLAHQ_S323_I]] // int32x4_t test_vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { return vqrdmlahq_laneq_s32(a, b, v, 3); @@ -78,10 +70,9 @@ int32_t test_vqrdmlahs_s32(int32_t a, int32_t b, int32_t c) { // CHECK-LABEL: @test_vqrdmlahh_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[C:%.*]], i32 3 // CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A:%.*]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0 -// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[VGET_LANE]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <4 x i32> // CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) // CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLAHH_S16_I]], i64 0 // CHECK-NEXT: ret i16 [[TMP3]] @@ -92,7 +83,7 @@ int16_t test_vqrdmlahh_lane_s16(int16_t a, int16_t b, int16x4_t c) { // CHECK-LABEL: @test_vqrdmlahs_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 1 +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i64 1 // CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGET_LANE]]) // CHECK-NEXT: ret i32 [[VQRDMLAHS_S32_I]] // @@ -102,10 +93,9 @@ int32_t test_vqrdmlahs_lane_s32(int32_t a, int32_t b, int32x2_t c) { // CHECK-LABEL: @test_vqrdmlahh_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[C:%.*]], i32 7 // CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A:%.*]], i64 0 // CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0 -// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[VGETQ_LANE]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[C:%.*]], <8 x i16> poison, <4 x i32> // CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) // CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLAHH_S16_I]], i64 0 // CHECK-NEXT: ret i16 [[TMP3]] @@ -116,7 +106,7 @@ int16_t test_vqrdmlahh_laneq_s16(int16_t a, int16_t b, int16x8_t c) { // CHECK-LABEL: @test_vqrdmlahs_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3 +// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i64 3 // CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGETQ_LANE]]) // CHECK-NEXT: ret i32 [[VQRDMLAHS_S32_I]] // @@ -126,11 +116,9 @@ int32_t test_vqrdmlahs_laneq_s32(int32_t a, int32_t b, int32x4_t c) { 
// CHECK-LABEL: @test_vqrdmlsh_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
-// CHECK-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VQRDMLSH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-NEXT: ret <4 x i16> [[VQRDMLSH_S163_I]]
//
int16x4_t test_vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
  return vqrdmlsh_laneq_s16(a, b, v, 7);
@@ -138,11 +126,9 @@ int16x4_t test_vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmlsh_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
-// CHECK-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[VQRDMLSH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-NEXT: ret <2 x i32> [[VQRDMLSH_S323_I]]
//
int32x2_t test_vqrdmlsh_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
  return vqrdmlsh_laneq_s32(a, b, v, 3);
@@ -150,11 +136,9 @@ int32x2_t test_vqrdmlsh_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-LABEL: @test_vqrdmlshq_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
-// CHECK-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT: [[VQRDMLSHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
+// CHECK-NEXT: ret <8 x i16> [[VQRDMLSHQ_S163_I]]
//
int16x8_t test_vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
  return vqrdmlshq_laneq_s16(a, b, v, 7);
@@ -162,11 +146,9 @@ int16x8_t test_vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
// CHECK-LABEL: @test_vqrdmlshq_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
-// CHECK-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VQRDMLSHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
+// CHECK-NEXT: ret <4 x i32> [[VQRDMLSHQ_S323_I]]
//
int32x4_t test_vqrdmlshq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
  return vqrdmlshq_laneq_s32(a, b, v, 3);
@@ -196,10 +178,9 @@ int32_t test_vqrdmlshs_s32(int32_t a, int32_t b, int32_t c) {
// CHECK-LABEL: @test_vqrdmlshh_lane_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[C:%.*]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
-// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[VGET_LANE]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLSHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
@@ -210,7 +191,7 @@ int16_t test_vqrdmlshh_lane_s16(int16_t a, int16_t b, int16x4_t c) {
// CHECK-LABEL: @test_vqrdmlshs_lane_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 1
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i64 1
// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGET_LANE]])
// CHECK-NEXT: ret i32 [[VQRDMLSHS_S32_I]]
//
@@ -220,10 +201,9 @@ int32_t test_vqrdmlshs_lane_s32(int32_t a, int32_t b, int32x2_t c) {
// CHECK-LABEL: @test_vqrdmlshh_laneq_s16(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[C:%.*]], i32 7
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[B:%.*]], i64 0
-// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> poison, i16 [[VGETQ_LANE]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[C:%.*]], <8 x i16> poison, <4 x i32> <i32 7, i32 poison, i32 poison, i32 poison>
// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLSHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
@@ -234,7 +214,7 @@ int16_t test_vqrdmlshh_laneq_s16(int16_t a, int16_t b, int16x8_t c) {
// CHECK-LABEL: @test_vqrdmlshs_laneq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i64 3
// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGETQ_LANE]])
// CHECK-NEXT: ret i32 [[VQRDMLSHS_S32_I]]
//
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.c
index b51e6f7e6e1ac..13456c08ce9d6 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.c
@@ -1,21 +1,13 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=UNCONSTRAINED %s
+// RUN: | opt -S -passes=mem2reg,instcombine \
+// RUN: | FileCheck --check-prefix=UNCONSTRAINED %s
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
// RUN: -ffp-exception-behavior=maytrap -DEXCEPT=1 \
// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=COMMONIR --check-prefix=CONSTRAINED --implicit-check-not=fpexcept.maytrap %s
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
-// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | llc -o=- - \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM %s
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
-// RUN: -ffp-exception-behavior=maytrap -DEXCEPT=1 \
-// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | llc -o=- - \
-// RUN: | FileCheck --check-prefix=COMMON --check-prefix=CHECK-ASM --implicit-check-not=fpexcept.maytrap %s
+// RUN: | opt -S -passes=mem2reg,instcombine \
+// RUN: | FileCheck --check-prefix=CONSTRAINED --implicit-check-not=fpexcept.maytrap %s
// REQUIRES: aarch64-registered-target
@@ -29,310 +21,418 @@
#include <arm_neon.h>
-// COMMON-LABEL: test_vsqrt_f16
-// UNCONSTRAINED: [[SQR:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %a)
-// CONSTRAINED: [[SQR:%.*]] = call <4 x half> @llvm.experimental.constrained.sqrt.v4f16(<4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fsqrt v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-// COMMONIR: ret <4 x half> [[SQR]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vsqrt_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[VSQRT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vsqrt_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <4 x half> @llvm.experimental.constrained.sqrt.v4f16(<4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2:[0-9]+]]
+// CONSTRAINED-NEXT: ret <4 x half> [[VSQRT_I]]
+//
float16x4_t test_vsqrt_f16(float16x4_t a) {
  return vsqrt_f16(a);
}
-// COMMON-LABEL: test_vsqrtq_f16
-// UNCONSTRAINED: [[SQR:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a)
-// CONSTRAINED: [[SQR:%.*]] = call <8 x half> @llvm.experimental.constrained.sqrt.v8f16(<8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fsqrt v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
-// COMMONIR: ret <8 x half> [[SQR]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vsqrtq_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <8 x half> [[VSQRT_I]]
+//
+// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vsqrtq_f16(
+// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[VSQRT_I:%.*]] = call <8 x half> @llvm.experimental.constrained.sqrt.v8f16(<8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <8 x half> [[VSQRT_I]]
+//
float16x8_t test_vsqrtq_f16(float16x8_t a) {
  return vsqrtq_f16(a);
}
-// COMMON-LABEL: test_vfma_f16
-// UNCONSTRAINED: [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a)
-// CONSTRAINED: [[ADD:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-// COMMONIR: ret <4 x half> [[ADD]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[C]], <4 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[B]], <4 x half> [[C]], <4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
  return vfma_f16(a, b, c);
}
-// COMMON-LABEL: test_vfmaq_f16
-// UNCONSTRAINED: [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a)
-// CONSTRAINED: [[ADD:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
-// COMMONIR: ret <8 x half> [[ADD]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[C]], <8 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_f16(
+// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[B]], <8 x half> [[C]], <8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
  return vfmaq_f16(a, b, c);
}
-// COMMON-LABEL: test_vfms_f16
-// COMMONIR: [[SUB:%.*]] = fneg <4 x half> %b
-// UNCONSTRAINED: [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[SUB]], <4 x half> %c, <4 x half> %a)
-// CONSTRAINED: [[ADD:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[SUB]], <4 x half> %c, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
-// COMMONIR: ret <4 x half> [[ADD]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <4 x half> [[B]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[C]], <4 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <4 x half> [[B]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[C]], <4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
  return vfms_f16(a, b, c);
}
-// COMMON-LABEL: test_vfmsq_f16
-// COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b
-// UNCONSTRAINED: [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[SUB]], <8 x half> %c, <8 x half> %a)
-// CONSTRAINED: [[ADD:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[SUB]], <8 x half> %c, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
-// COMMONIR: ret <8 x half> [[ADD]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <8 x half> [[B]]
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[C]], <8 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_f16(
+// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[FNEG_I:%.*]] = fneg <8 x half> [[B]]
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[C]], <8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <8 x half> [[TMP0]]
+//
float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
  return vfmsq_f16(a, b, c);
}
-// COMMON-LABEL: test_vfma_lane_f16
-// COMMONIR: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// COMMONIR: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// COMMONIR: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// UNCONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]])
-// CONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <4 x half> [[FMLA]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_lane_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[LANE]], <4 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[FMLA2]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_lane_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[B]], <4 x half> [[LANE]], <4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x half> [[FMLA2]]
+//
float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
  return vfma_lane_f16(a, b, c, 3);
}
-// COMMON-LABEL: test_vfmaq_lane_f16
-// COMMONIR: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// COMMONIR: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// COMMONIR: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]])
-// CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <8 x half> [[FMLA]]
+// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_lane_f16(
+// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[LANE]], <8 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <8 x half> [[FMLA2]]
+//
+// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_lane_f16(
+// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[B]], <8 x half> [[LANE]], <8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <8 x half> [[FMLA2]]
+//
float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
  return vfmaq_lane_f16(a, b, c, 3);
}
-// COMMON-LABEL: test_vfma_laneq_f16
-// COMMONIR: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// COMMONIR: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// UNCONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]])
-// CONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}]
-// COMMONIR: ret <4 x half> [[FMLA]]
+// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_laneq_f16(
+// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[B]], <4 x half> [[A]])
+// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
+// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_laneq_f16(
+// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[B]], <4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-NEXT: ret <4 x half> [[TMP0]]
+//
float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
  return vfma_laneq_f16(a, b, c, 7);
}
-// COMMON-LABEL: test_vfmaq_laneq_f16
-// COMMONIR: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// COMMONIR: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// COMMONIR: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8>
-// COMMONIR: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// COMMONIR: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// COMMONIR: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]])
-// CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]],
<8 x half> [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <8 x half> [[FMLA]] +// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_laneq_f16( +// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <8 x i32> +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[B]], <8 x half> [[A]]) +// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP0]] +// +// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_laneq_f16( +// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <8 x i32> +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[B]], <8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <8 x half> [[TMP0]] +// float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmaq_laneq_f16(a, b, c, 7); } -// COMMON-LABEL: test_vfma_n_f16 -// COMMONIR: [[TMP0:%.*]] = insertelement <4 x half> poison, half %c, i32 0 -// COMMONIR: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %c, i32 1 -// COMMONIR: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %c, i32 2 -// COMMONIR: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %c, i32 3 -// UNCONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> [[TMP3]], <4 x half> %a) -// CONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> [[TMP3]], <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <4 x half> [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_n_f16( +// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i64 0 +// UNCONSTRAINED-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[VECINIT3]], <4 x half> [[A]]) +// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP0]] +// +// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfma_n_f16( +// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i64 0 +// CONSTRAINED-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[B]], <4 x half> [[VECINIT3]], <4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <4 x half> [[TMP0]] 
+// float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) { return vfma_n_f16(a, b, c); } -// COMMON-LABEL: test_vfmaq_n_f16 -// COMMONIR: [[TMP0:%.*]] = insertelement <8 x half> poison, half %c, i32 0 -// COMMONIR: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %c, i32 1 -// COMMONIR: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %c, i32 2 -// COMMONIR: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %c, i32 3 -// COMMONIR: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %c, i32 4 -// COMMONIR: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %c, i32 5 -// COMMONIR: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %c, i32 6 -// COMMONIR: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %c, i32 7 -// UNCONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> [[TMP7]], <8 x half> %a) -// CONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> [[TMP7]], <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <8 x half> [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_n_f16( +// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i64 0 +// UNCONSTRAINED-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[VECINIT7]], <8 x half> [[A]]) +// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP0]] +// +// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmaq_n_f16( +// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i64 0 +// CONSTRAINED-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[B]], <8 x half> [[VECINIT7]], <8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <8 x half> [[TMP0]] +// float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) { return vfmaq_n_f16(a, b, c); } -// COMMON-LABEL: test_vfmah_lane_f16 -// COMMONIR: [[EXTR:%.*]] = extractelement <4 x half> %c, i32 3 -// UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half [[EXTR]], half %a) -// CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half %b, half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret half [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local half @test_vfmah_lane_f16( +// UNCONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i64 3 +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half [[EXTRACT]], half [[A]]) +// UNCONSTRAINED-NEXT: 
ret half [[TMP0]] +// +// CONSTRAINED-LABEL: define dso_local half @test_vfmah_lane_f16( +// CONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i64 3 +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[B]], half [[EXTRACT]], half [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret half [[TMP0]] +// float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) { return vfmah_lane_f16(a, b, c, 3); } -// COMMON-LABEL: test_vfmah_laneq_f16 -// COMMONIR: [[EXTR:%.*]] = extractelement <8 x half> %c, i32 7 -// UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half %b, half [[EXTR]], half %a) -// CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half %b, half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret half [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local half @test_vfmah_laneq_f16( +// UNCONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i64 7 +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half [[EXTRACT]], half [[A]]) +// UNCONSTRAINED-NEXT: ret half [[TMP0]] +// +// CONSTRAINED-LABEL: define dso_local half @test_vfmah_laneq_f16( +// CONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i64 7 +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[B]], half [[EXTRACT]], half [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret half [[TMP0]] +// float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) { return vfmah_laneq_f16(a, b, c, 7); } -// COMMON-LABEL: test_vfms_lane_f16 -// COMMONIR: [[SUB:%.*]] = fneg <4 x half> %b -// COMMONIR: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <4 x half> [[SUB]] to <8 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8> -// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> -// COMMONIR: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> -// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// COMMONIR: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// UNCONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]]) -// CONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <4 x half> [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_lane_f16( +// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg 
<4 x half> [[B]] +// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <4 x i32> +// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[LANE]], <4 x half> [[A]]) +// UNCONSTRAINED-NEXT: ret <4 x half> [[FMLA2]] +// +// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_lane_f16( +// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]] +// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <4 x i32> +// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[LANE]], <4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <4 x half> [[FMLA2]] +// float16x4_t test_vfms_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) { return vfms_lane_f16(a, b, c, 3); } -// COMMON-LABEL: test_vfmsq_lane_f16 -// COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b -// COMMONIR: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <8 x half> [[SUB]] to <16 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8> -// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half> -// COMMONIR: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32> -// COMMONIR: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]]) -// CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <8 x half> [[FMLA]] +// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_lane_f16( +// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]] +// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <8 x i32> +// UNCONSTRAINED-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[LANE]], <8 x half> [[A]]) +// UNCONSTRAINED-NEXT: ret <8 x half> [[FMLA2]] +// +// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_lane_f16( +// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]] +// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <8 x i32> +// CONSTRAINED-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[LANE]], <8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <8 x half> [[FMLA2]] +// float16x8_t test_vfmsq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) { return vfmsq_lane_f16(a, b, c, 3); } -// COMMON-LABEL: test_vfms_laneq_f16 -// COMMONIR: [[SUB:%.*]] = 
fneg <4 x half> %b -// CHECK-ASM-NOT: fneg -// COMMONIR: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <4 x half> [[SUB]] to <8 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8> -// COMMONIR: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// COMMONIR: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> -// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> -// COMMONIR: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32> -// UNCONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]]) -// CONSTRAINED: [[FMLA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <4 x half> [[FMLA]] +// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_laneq_f16( +// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]] +// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <4 x i32> +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[FNEG]], <4 x half> [[A]]) +// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP0]] +// +// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_laneq_f16( +// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]] +// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <4 x i32> +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[FNEG]], <4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <4 x half> [[TMP0]] +// float16x4_t test_vfms_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) { return vfms_laneq_f16(a, b, c, 7); } -// COMMON-LABEL: test_vfmsq_laneq_f16 -// COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b -// CHECK-ASM-NOT: fneg -// COMMONIR: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> -// COMMONIR: [[TMP1:%.*]] = bitcast <8 x half> [[SUB]] to <16 x i8> -// COMMONIR: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8> -// COMMONIR: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// COMMONIR: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// COMMONIR: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half> -// COMMONIR: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> -// UNCONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]]) -// CONSTRAINED: [[FMLA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <8 x half> [[FMLA]] +// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_laneq_f16( +// UNCONSTRAINED-SAME: <8 x half> noundef 
[[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]] +// UNCONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <8 x i32> +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[FNEG]], <8 x half> [[A]]) +// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP0]] +// +// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_laneq_f16( +// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]] +// CONSTRAINED-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <8 x i32> +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[FNEG]], <8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <8 x half> [[TMP0]] +// float16x8_t test_vfmsq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmsq_laneq_f16(a, b, c, 7); } -// COMMON-LABEL: test_vfms_n_f16 -// COMMONIR: [[SUB:%.*]] = fneg <4 x half> %b -// COMMONIR: [[TMP0:%.*]] = insertelement <4 x half> poison, half %c, i32 0 -// COMMONIR: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %c, i32 1 -// COMMONIR: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %c, i32 2 -// COMMONIR: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %c, i32 3 -// UNCONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[SUB]], <4 x half> [[TMP3]], <4 x half> %a) -// CONSTRAINED: [[FMA:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[SUB]], <4 x half> [[TMP3]], <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <4 x half> [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_n_f16( +// UNCONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]] +// UNCONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i64 0 +// UNCONSTRAINED-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[VECINIT3]], <4 x half> [[A]]) +// UNCONSTRAINED-NEXT: ret <4 x half> [[TMP0]] +// +// CONSTRAINED-LABEL: define dso_local <4 x half> @test_vfms_n_f16( +// CONSTRAINED-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]] +// CONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i64 0 +// CONSTRAINED-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[VECINIT3]], <4 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <4 x half> [[TMP0]] +// 
float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) { return vfms_n_f16(a, b, c); } -// COMMON-LABEL: test_vfmsq_n_f16 -// COMMONIR: [[SUB:%.*]] = fneg <8 x half> %b -// COMMONIR: [[TMP0:%.*]] = insertelement <8 x half> poison, half %c, i32 0 -// COMMONIR: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %c, i32 1 -// COMMONIR: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %c, i32 2 -// COMMONIR: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %c, i32 3 -// COMMONIR: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %c, i32 4 -// COMMONIR: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %c, i32 5 -// COMMONIR: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %c, i32 6 -// COMMONIR: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %c, i32 7 -// UNCONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[SUB]], <8 x half> [[TMP7]], <8 x half> %a) -// CONSTRAINED: [[FMA:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[SUB]], <8 x half> [[TMP7]], <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret <8 x half> [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_n_f16( +// UNCONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]] +// UNCONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i64 0 +// UNCONSTRAINED-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[VECINIT7]], <8 x half> [[A]]) +// UNCONSTRAINED-NEXT: ret <8 x half> [[TMP0]] +// +// CONSTRAINED-LABEL: define dso_local <8 x half> @test_vfmsq_n_f16( +// CONSTRAINED-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]] +// CONSTRAINED-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i64 0 +// CONSTRAINED-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[VECINIT7]], <8 x half> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret <8 x half> [[TMP0]] +// float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) { return vfmsq_n_f16(a, b, c); } -// COMMON-LABEL: test_vfmsh_lane_f16 -// UNCONSTRAINED: [[TMP0:%.*]] = fpext half %b to float -// CONSTRAINED: [[TMP0:%.*]] = call float @llvm.experimental.constrained.fpext.f32.f16(half %b, metadata !"fpexcept.strict") -// CHECK-ASM: fcvt s{{[0-9]+}}, h{{[0-9]+}} -// COMMONIR: [[TMP1:%.*]] = fneg float [[TMP0]] -// CHECK-ASM: fneg s{{[0-9]+}}, s{{[0-9]+}} -// UNCONSTRAINED: [[SUB:%.*]] = fptrunc float [[TMP1]] to half -// CONSTRAINED: [[SUB:%.*]] = call half @llvm.experimental.constrained.fptrunc.f16.f32(float [[TMP1]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fcvt h{{[0-9]+}}, s{{[0-9]+}} -// COMMONIR: [[EXTR:%.*]] = extractelement <4 x half> %c, i32 3 -// UNCONSTRAINED: 
[[FMA:%.*]] = call half @llvm.fma.f16(half [[SUB]], half [[EXTR]], half %a) -// CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[SUB]], half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret half [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local half @test_vfmsh_lane_f16( +// UNCONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fneg half [[B]] +// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i64 3 +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = call half @llvm.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]]) +// UNCONSTRAINED-NEXT: ret half [[TMP1]] +// +// CONSTRAINED-LABEL: define dso_local half @test_vfmsh_lane_f16( +// CONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[CONV:%.*]] = call float @llvm.experimental.constrained.fpext.f32.f16(half [[B]], metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg float [[CONV]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.experimental.constrained.fptrunc.f16.f32(float [[FNEG]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i64 3 +// CONSTRAINED-NEXT: [[TMP1:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret half [[TMP1]] +// float16_t test_vfmsh_lane_f16(float16_t a, float16_t b, float16x4_t c) { return vfmsh_lane_f16(a, b, c, 3); } -// COMMON-LABEL: test_vfmsh_laneq_f16 -// UNCONSTRAINED: [[TMP0:%.*]] = fpext half %b to float -// CONSTRAINED: [[TMP0:%.*]] = call float @llvm.experimental.constrained.fpext.f32.f16(half %b, metadata !"fpexcept.strict") -// CHECK-ASM: fcvt s{{[0-9]+}}, h{{[0-9]+}} -// COMMONIR: [[TMP1:%.*]] = fneg float [[TMP0]] -// CHECK-ASM: fneg s{{[0-9]+}}, s{{[0-9]+}} -// UNCONSTRAINED: [[SUB:%.*]] = fptrunc float [[TMP1]] to half -// CONSTRAINED: [[SUB:%.*]] = call half @llvm.experimental.constrained.fptrunc.f16.f32(float [[TMP1]], metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fcvt h{{[0-9]+}}, s{{[0-9]+}} -// COMMONIR: [[EXTR:%.*]] = extractelement <8 x half> %c, i32 7 -// UNCONSTRAINED: [[FMA:%.*]] = call half @llvm.fma.f16(half [[SUB]], half [[EXTR]], half %a) -// CONSTRAINED: [[FMA:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[SUB]], half [[EXTR]], half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") -// CHECK-ASM: fmla h{{[0-9]+}}, h{{[0-9]+}}, v{{[0-9]+}}.h[{{[0-9]+}}] -// COMMONIR: ret half [[FMA]] +// UNCONSTRAINED-LABEL: define dso_local half @test_vfmsh_laneq_f16( +// UNCONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// UNCONSTRAINED-NEXT: [[ENTRY:.*:]] +// UNCONSTRAINED-NEXT: [[TMP0:%.*]] = fneg half [[B]] +// UNCONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i64 7 +// UNCONSTRAINED-NEXT: [[TMP1:%.*]] = call half @llvm.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]]) +// UNCONSTRAINED-NEXT: ret half [[TMP1]] +// +// CONSTRAINED-LABEL: define dso_local half @test_vfmsh_laneq_f16( +// 
CONSTRAINED-SAME: half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CONSTRAINED-NEXT: [[ENTRY:.*:]] +// CONSTRAINED-NEXT: [[CONV:%.*]] = call float @llvm.experimental.constrained.fpext.f32.f16(half [[B]], metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: [[FNEG:%.*]] = fneg float [[CONV]] +// CONSTRAINED-NEXT: [[TMP0:%.*]] = call half @llvm.experimental.constrained.fptrunc.f16.f32(float [[FNEG]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i64 7 +// CONSTRAINED-NEXT: [[TMP1:%.*]] = call half @llvm.experimental.constrained.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]] +// CONSTRAINED-NEXT: ret half [[TMP1]] +// float16_t test_vfmsh_laneq_f16(float16_t a, float16_t b, float16x8_t c) { return vfmsh_laneq_f16(a, b, c, 7); } diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-generic.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-generic.c index 4d2ef318005bd..b616664e395c0 100644 --- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-generic.c +++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics-generic.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature -fullfp16 -target-feature +v8a\ // RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg \ +// RUN: | opt -S -passes=mem2reg,instcombine \ // RUN: | FileCheck %s // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\ // RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg \ +// RUN: | opt -S -passes=mem2reg,instcombine \ // RUN: | FileCheck %s // REQUIRES: aarch64-registered-target @@ -15,17 +15,14 @@ // CHECK-LABEL: define {{[^@]+}}@test_vbsl_f16 // CHECK-SAME: (<4 x i16> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8> -// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16> +// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <4 x half> [[B]] to <4 x i16> +// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <4 x half> [[C]] to <4 x i16> // CHECK-NEXT: [[VBSL3_I:%.*]] = and <4 x i16> [[A]], [[VBSL1_I]] -// CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i16> [[A]], splat (i16 -1) -// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]] -// CHECK-NEXT: [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]] -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <4 x half> -// CHECK-NEXT: ret <4 x half> [[TMP4]] +// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i16> [[A]], splat (i16 -1) +// CHECK-NEXT: [[VBSL4_I:%.*]] = and <4 x i16> [[TMP0]], [[VBSL2_I]] +// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <4 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP1]] // float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) { return 
vbsl_f16(a, b, c); @@ -34,17 +31,14 @@ float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) { // CHECK-LABEL: define {{[^@]+}}@test_vbslq_f16 // CHECK-SAME: (<8 x i16> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8> -// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16> +// CHECK-NEXT: [[VBSL1_I:%.*]] = bitcast <8 x half> [[B]] to <8 x i16> +// CHECK-NEXT: [[VBSL2_I:%.*]] = bitcast <8 x half> [[C]] to <8 x i16> // CHECK-NEXT: [[VBSL3_I:%.*]] = and <8 x i16> [[A]], [[VBSL1_I]] -// CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i16> [[A]], splat (i16 -1) -// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]] -// CHECK-NEXT: [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]] -// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[VBSL5_I]] to <8 x half> -// CHECK-NEXT: ret <8 x half> [[TMP4]] +// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i16> [[A]], splat (i16 -1) +// CHECK-NEXT: [[VBSL4_I:%.*]] = and <8 x i16> [[TMP0]], [[VBSL2_I]] +// CHECK-NEXT: [[VBSL5_I:%.*]] = or disjoint <8 x i16> [[VBSL3_I]], [[VBSL4_I]] +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VBSL5_I]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP1]] // float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) { return vbslq_f16(a, b, c); @@ -53,21 +47,12 @@ float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) { // CHECK-LABEL: define {{[^@]+}}@test_vzip_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8 -// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> // CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> -// CHECK-NEXT: store <4 x half> [[VZIP_I]], ptr [[RETVAL_I]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[RETVAL_I]], i32 1 // CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> -// CHECK-NEXT: store <4 x half> [[VZIP1_I]], ptr [[TMP2]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL_I]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], i32 0, i32 0 -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP3]], 0 -// CHECK-NEXT: store [2 x <4 x half>] [[TMP5]], ptr [[TMP4]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8 -// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP6]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x half>] poison, <4 x half> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x half>] [[TMP0]], <4 x half> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T:%.*]] poison, [2 x <4 x half>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP1]] // float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) { return vzip_f16(a, b); @@ -76,21 +61,12 @@ float16x4x2_t 
test_vzip_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vzipq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16 -// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> // CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> -// CHECK-NEXT: store <8 x half> [[VZIP_I]], ptr [[RETVAL_I]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[RETVAL_I]], i32 1 // CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> -// CHECK-NEXT: store <8 x half> [[VZIP1_I]], ptr [[TMP2]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL_I]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], i32 0, i32 0 -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP3]], 0 -// CHECK-NEXT: store [2 x <8 x half>] [[TMP5]], ptr [[TMP4]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP6]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x half>] poison, <8 x half> [[VZIP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x half>] [[TMP0]], <8 x half> [[VZIP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T:%.*]] poison, [2 x <8 x half>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP1]] // float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) { return vzipq_f16(a, b); @@ -99,21 +75,12 @@ float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vuzp_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8 -// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> // CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> -// CHECK-NEXT: store <4 x half> [[VUZP_I]], ptr [[RETVAL_I]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[RETVAL_I]], i32 1 // CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> -// CHECK-NEXT: store <4 x half> [[VUZP1_I]], ptr [[TMP2]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL_I]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], i32 0, i32 0 -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP3]], 0 -// CHECK-NEXT: store [2 x <4 x half>] [[TMP5]], ptr [[TMP4]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8 -// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP6]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x half>] poison, <4 x half> [[VUZP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x half>] [[TMP0]], <4 x half> [[VUZP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue 
[[STRUCT_FLOAT16X4X2_T:%.*]] poison, [2 x <4 x half>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP1]] // float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) { return vuzp_f16(a, b); @@ -122,21 +89,12 @@ float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vuzpq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16 -// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> // CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> -// CHECK-NEXT: store <8 x half> [[VUZP_I]], ptr [[RETVAL_I]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[RETVAL_I]], i32 1 // CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> -// CHECK-NEXT: store <8 x half> [[VUZP1_I]], ptr [[TMP2]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL_I]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], i32 0, i32 0 -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP3]], 0 -// CHECK-NEXT: store [2 x <8 x half>] [[TMP5]], ptr [[TMP4]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP6]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x half>] poison, <8 x half> [[VUZP_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x half>] [[TMP0]], <8 x half> [[VUZP1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T:%.*]] poison, [2 x <8 x half>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP1]] // float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) { return vuzpq_f16(a, b); @@ -145,21 +103,12 @@ float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vtrn_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8 -// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> // CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> -// CHECK-NEXT: store <4 x half> [[VTRN_I]], ptr [[RETVAL_I]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[RETVAL_I]], i32 1 // CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> -// CHECK-NEXT: store <4 x half> [[VTRN1_I]], ptr [[TMP2]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL_I]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], i32 0, i32 0 -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP3]], 0 -// CHECK-NEXT: store [2 x <4 x half>] [[TMP5]], ptr [[TMP4]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], ptr [[RETVAL]], align 8 -// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP6]] +// 
CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <4 x half>] poison, <4 x half> [[VTRN_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <4 x half>] [[TMP0]], <4 x half> [[VTRN1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT16X4X2_T:%.*]] poison, [2 x <4 x half>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X4X2_T]] [[TMP1]] // float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) { return vtrn_f16(a, b); @@ -168,21 +117,12 @@ float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vtrnq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16 -// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> // CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> -// CHECK-NEXT: store <8 x half> [[VTRN_I]], ptr [[RETVAL_I]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[RETVAL_I]], i32 1 // CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> -// CHECK-NEXT: store <8 x half> [[VTRN1_I]], ptr [[TMP2]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL_I]], align 16 -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], i32 0, i32 0 -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP3]], 0 -// CHECK-NEXT: store [2 x <8 x half>] [[TMP5]], ptr [[TMP4]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], ptr [[RETVAL]], align 16 -// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP6]] +// CHECK-NEXT: [[TMP0:%.*]] = insertvalue [2 x <8 x half>] poison, <8 x half> [[VTRN_I]], 0 +// CHECK-NEXT: [[DOTUNPACK8:%.*]] = insertvalue [2 x <8 x half>] [[TMP0]], <8 x half> [[VTRN1_I]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[STRUCT_FLOAT16X8X2_T:%.*]] poison, [2 x <8 x half>] [[DOTUNPACK8]], 0 +// CHECK-NEXT: ret [[STRUCT_FLOAT16X8X2_T]] [[TMP1]] // float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) { return vtrnq_f16(a, b); @@ -191,10 +131,8 @@ float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmov_n_f16 // CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0 -// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1 -// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2 -// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i64 0 +// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer // CHECK-NEXT: ret <4 x half> [[VECINIT3]] // float16x4_t test_vmov_n_f16(float16_t a) { @@ -204,14 +142,8 @@ float16x4_t test_vmov_n_f16(float16_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmovq_n_f16 // CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0 -// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half 
[[A]], i32 1 -// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2 -// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3 -// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5 -// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6 -// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i64 0 +// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer // CHECK-NEXT: ret <8 x half> [[VECINIT7]] // float16x8_t test_vmovq_n_f16(float16_t a) { @@ -221,10 +153,8 @@ float16x8_t test_vmovq_n_f16(float16_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vdup_n_f16 // CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0 -// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1 -// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2 -// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i64 0 +// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer // CHECK-NEXT: ret <4 x half> [[VECINIT3]] // float16x4_t test_vdup_n_f16(float16_t a) { @@ -234,14 +164,8 @@ float16x4_t test_vdup_n_f16(float16_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vdupq_n_f16 // CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0 -// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1 -// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2 -// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3 -// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5 -// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6 -// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i64 0 +// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer // CHECK-NEXT: ret <8 x half> [[VECINIT7]] // float16x8_t test_vdupq_n_f16(float16_t a) { @@ -251,9 +175,7 @@ float16x8_t test_vdupq_n_f16(float16_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vdup_lane_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[A]], <4 x half> poison, <4 x i32> // CHECK-NEXT: ret <4 x half> [[LANE]] // float16x4_t test_vdup_lane_f16(float16x4_t a) { @@ -263,9 +185,7 @@ 
float16x4_t test_vdup_lane_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vdupq_lane_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[A]], <4 x half> poison, <8 x i32>
// CHECK-NEXT: ret <8 x half> [[LANE]]
//
float16x8_t test_vdupq_lane_f16(float16x4_t a) {
@@ -275,9 +195,7 @@ float16x8_t test_vdupq_lane_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vdup_laneq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32>
// CHECK-NEXT: ret <4 x half> [[LANE]]
//
float16x4_t test_vdup_laneq_f16(float16x8_t a) {
@@ -287,9 +205,7 @@ float16x4_t test_vdup_laneq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vdupq_laneq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32>
// CHECK-NEXT: ret <8 x half> [[LANE]]
//
float16x8_t test_vdupq_laneq_f16(float16x8_t a) {
@@ -299,11 +215,7 @@ float16x8_t test_vdupq_laneq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vext_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP3]], <4 x i32>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32>
// CHECK-NEXT: ret <4 x half> [[VEXT]]
//
float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
@@ -313,11 +225,7 @@ float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vextq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x i32>
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32>
// CHECK-NEXT: ret <8 x half> [[VEXT]]
//
float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) {
@@ -327,7 +235,7 @@ float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vrev64_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[A]], <4 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> poison, <4 x i32>
// CHECK-NEXT: ret <4 x half> [[SHUFFLE_I]]
//
float16x4_t test_vrev64_f16(float16x4_t a) {
@@ -337,7 +245,7 @@ float16x4_t test_vrev64_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrev64q_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <8 x i32>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32>
// CHECK-NEXT: ret <8 x half> [[SHUFFLE_I]]
//
float16x8_t test_vrev64q_f16(float16x8_t a) {
@@ -467,7 +375,7 @@ float16x8_t test_vtrn2q_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vduph_laneq_f16
// CHECK-SAME: (<8 x half> noundef [[VEC:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x half> [[VEC]], i32 7
+// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x half> [[VEC]], i64 7
// CHECK-NEXT: ret half [[VGETQ_LANE]]
//
float16_t test_vduph_laneq_f16(float16x8_t vec) {
@@ -477,7 +385,7 @@ float16_t test_vduph_laneq_f16(float16x8_t vec) {
// CHECK-LABEL: define {{[^@]+}}@test_vduph_lane_f16
// CHECK-SAME: (<4 x half> noundef [[VEC:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x half> [[VEC]], i32 3
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x half> [[VEC]], i64 3
// CHECK-NEXT: ret half [[VGET_LANE]]
//
float16_t test_vduph_lane_f16(float16x4_t vec) {
diff --git a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
index 1cce977b60e6b..ddde409637d58 100644
--- a/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.2a-neon-intrinsics.c
@@ -1,7 +1,7 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
// RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck %s
// REQUIRES: aarch64-registered-target
@@ -11,7 +11,6 @@
// CHECK-LABEL: define {{[^@]+}}@test_vabs_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VABS1_I]]
//
@@ -22,7 +21,6 @@ float16x4_t test_vabs_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vabsq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VABS1_I]]
//
@@ -33,9 +31,8 @@ float16x8_t test_vabsq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vceqz_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp oeq <4 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VCEQZ_I]]
//
uint16x4_t test_vceqz_f16(float16x4_t a) {
@@ -45,9 +42,8 @@ uint16x4_t test_vceqz_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vceqzq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp oeq <8 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[VCEQZ_I]]
//
uint16x8_t test_vceqzq_f16(float16x8_t a) {
@@ -57,9 +53,8 @@ uint16x8_t test_vceqzq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcgez_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp oge <4 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VCGEZ_I]]
//
uint16x4_t test_vcgez_f16(float16x4_t a) {
@@ -69,9 +64,8 @@ uint16x4_t test_vcgez_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcgezq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp oge <8 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[VCGEZ_I]]
//
uint16x8_t test_vcgezq_f16(float16x8_t a) {
@@ -81,9 +75,8 @@ uint16x8_t test_vcgezq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcgtz_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VCGTZ_I]]
//
uint16x4_t test_vcgtz_f16(float16x4_t a) {
@@ -93,9 +86,8 @@ uint16x4_t test_vcgtz_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcgtzq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <8 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[VCGTZ_I]]
//
uint16x8_t test_vcgtzq_f16(float16x8_t a) {
@@ -105,9 +97,8 @@ uint16x8_t test_vcgtzq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vclez_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp ole <4 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VCLEZ_I]]
//
uint16x4_t test_vclez_f16(float16x4_t a) {
@@ -117,9 +108,8 @@ uint16x4_t test_vclez_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vclezq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp ole <8 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[VCLEZ_I]]
//
uint16x8_t test_vclezq_f16(float16x8_t a) {
@@ -129,9 +119,8 @@ uint16x8_t test_vclezq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcltz_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp olt <4 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16>
// CHECK-NEXT: ret <4 x i16> [[VCLTZ_I]]
//
uint16x4_t test_vcltz_f16(float16x4_t a) {
@@ -141,9 +130,8 @@ uint16x4_t test_vcltz_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcltzq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = fcmp olt <8 x half> [[A]], zeroinitializer
-// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16>
// CHECK-NEXT: ret <8 x i16> [[VCLTZ_I]]
//
uint16x8_t test_vcltzq_f16(float16x8_t a) {
@@ -153,7 +141,6 @@ uint16x8_t test_vcltzq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvt_f16_s16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <4 x i16> [[A]] to <4 x half>
// CHECK-NEXT: ret <4 x half> [[VCVT_I]]
//
@@ -164,7 +151,6 @@ float16x4_t test_vcvt_f16_s16 (int16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_f16_s16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <8 x i16> [[A]] to <8 x half>
// CHECK-NEXT: ret <8 x half> [[VCVT_I]]
//
@@ -175,7 +161,6 @@ float16x8_t test_vcvtq_f16_s16 (int16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvt_f16_u16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <4 x i16> [[A]] to <4 x half>
// CHECK-NEXT: ret <4 x half> [[VCVT_I]]
//
@@ -186,7 +171,6 @@ float16x4_t test_vcvt_f16_u16 (uint16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_f16_u16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <8 x i16> [[A]] to <8 x half>
// CHECK-NEXT: ret <8 x half> [[VCVT_I]]
//
@@ -197,7 +181,6 @@ float16x8_t test_vcvtq_f16_u16 (uint16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvt_s16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTZ1_I]]
//
@@ -208,7 +191,6 @@ int16x4_t test_vcvt_s16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_s16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzs.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTZ1_I]]
//
@@ -219,7 +201,6 @@ int16x8_t test_vcvtq_s16_f16 (float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvt_u16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTZ1_I]]
//
@@ -230,7 +211,6 @@ uint16x4_t test_vcvt_u16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_u16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTZ1_I]]
//
@@ -241,7 +221,6 @@ uint16x8_t test_vcvtq_u16_f16 (float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvta_s16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTA1_I]]
//
@@ -252,7 +231,6 @@ int16x4_t test_vcvta_s16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvta_u16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTA1_I]]
//
@@ -263,7 +241,6 @@ uint16x4_t test_vcvta_u16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtaq_s16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtas.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTA1_I]]
//
@@ -274,7 +251,6 @@ int16x8_t test_vcvtaq_s16_f16 (float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtm_s16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTM1_I]]
//
@@ -285,7 +261,6 @@ int16x4_t test_vcvtm_s16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtmq_s16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtms.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTM1_I]]
//
@@ -296,7 +271,6 @@ int16x8_t test_vcvtmq_s16_f16 (float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtm_u16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTM1_I]]
//
@@ -307,7 +281,6 @@ uint16x4_t test_vcvtm_u16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtmq_u16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtmu.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTM1_I]]
//
@@ -318,7 +291,6 @@ uint16x8_t test_vcvtmq_u16_f16 (float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtn_s16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTN1_I]]
//
@@ -329,7 +301,6 @@ int16x4_t test_vcvtn_s16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtnq_s16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtns.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTN1_I]]
//
@@ -340,7 +311,6 @@ int16x8_t test_vcvtnq_s16_f16 (float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtn_u16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTN1_I]]
//
@@ -351,7 +321,6 @@ uint16x4_t test_vcvtn_u16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtnq_u16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtnu.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTN1_I]]
//
@@ -362,7 +331,6 @@ uint16x8_t test_vcvtnq_u16_f16 (float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtp_s16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTP1_I]]
//
@@ -373,7 +341,6 @@ int16x4_t test_vcvtp_s16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtpq_s16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtps.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTP1_I]]
//
@@ -384,7 +351,6 @@ int16x8_t test_vcvtpq_s16_f16 (float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtp_u16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTP1_I]]
//
@@ -395,7 +361,6 @@ uint16x4_t test_vcvtp_u16_f16 (float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtpq_u16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtpu.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTP1_I]]
//
@@ -427,7 +392,6 @@ float16x8_t test_vnegq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrecpe_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecpe.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRECPE_V1_I]]
//
@@ -438,7 +402,6 @@ float16x4_t test_vrecpe_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrecpeq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecpe.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRECPEQ_V1_I]]
//
@@ -449,7 +412,6 @@ float16x8_t test_vrecpeq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrnd_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <4 x half> @llvm.trunc.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDZ1_I]]
//
@@ -460,7 +422,6 @@ float16x4_t test_vrnd_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <8 x half> @llvm.trunc.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDZ1_I]]
//
@@ -471,7 +432,6 @@ float16x8_t test_vrndq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrnda_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <4 x half> @llvm.round.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDA1_I]]
//
@@ -482,7 +442,6 @@ float16x4_t test_vrnda_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndaq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <8 x half> @llvm.round.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDA1_I]]
//
@@ -493,7 +452,6 @@ float16x8_t test_vrndaq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndi_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDI_V1_I:%.*]] = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDI_V1_I]]
//
@@ -504,7 +462,6 @@ float16x4_t test_vrndi_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndiq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDIQ_V1_I:%.*]] = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDIQ_V1_I]]
//
@@ -515,7 +472,6 @@ float16x8_t test_vrndiq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndm_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <4 x half> @llvm.floor.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDM1_I]]
//
@@ -526,7 +482,6 @@ float16x4_t test_vrndm_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndmq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <8 x half> @llvm.floor.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDM1_I]]
//
@@ -537,7 +492,6 @@ float16x8_t test_vrndmq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndn_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <4 x half> @llvm.roundeven.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDN1_I]]
//
@@ -548,7 +502,6 @@ float16x4_t test_vrndn_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndnq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <8 x half> @llvm.roundeven.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDN1_I]]
//
@@ -559,7 +512,6 @@ float16x8_t test_vrndnq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndp_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <4 x half> @llvm.ceil.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDP1_I]]
//
@@ -570,7 +522,6 @@ float16x4_t test_vrndp_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndpq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <8 x half> @llvm.ceil.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDP1_I]]
//
@@ -581,7 +532,6 @@ float16x8_t test_vrndpq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndx_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <4 x half> @llvm.rint.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDX1_I]]
//
@@ -592,7 +542,6 @@ float16x4_t test_vrndx_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrndxq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <8 x half> @llvm.rint.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDX1_I]]
//
@@ -603,7 +552,6 @@ float16x8_t test_vrndxq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrsqrte_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrte.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRSQRTE_V1_I]]
//
@@ -614,7 +562,6 @@ float16x4_t test_vrsqrte_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vrsqrteq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrte.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRSQRTEQ_V1_I]]
//
@@ -625,7 +572,6 @@ float16x8_t test_vrsqrteq_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vsqrt_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VSQRT_I:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VSQRT_I]]
//
@@ -636,7 +582,6 @@ float16x4_t test_vsqrt_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vsqrtq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VSQRT_I:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VSQRT_I]]
//
@@ -667,8 +612,6 @@ float16x8_t test_vaddq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vabd_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fabd.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VABD2_I]]
//
@@ -679,8 +622,6 @@ float16x4_t test_vabd_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vabdq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fabd.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VABD2_I]]
//
@@ -691,8 +632,6 @@ float16x8_t test_vabdq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcage_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x i16> [[VCAGE_V2_I]]
//
@@ -703,8 +642,6 @@ uint16x4_t test_vcage_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcageq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x i16> [[VCAGEQ_V2_I]]
//
@@ -715,8 +652,6 @@ uint16x8_t test_vcageq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcagt_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x i16> [[VCAGT_V2_I]]
//
@@ -727,8 +662,6 @@ uint16x4_t test_vcagt_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcagtq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x i16> [[VCAGTQ_V2_I]]
//
@@ -739,8 +672,6 @@ uint16x8_t test_vcagtq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcale_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCALE_V2_I]]
//
@@ -751,8 +682,6 @@ uint16x4_t test_vcale_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcaleq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCALEQ_V2_I]]
//
@@ -763,8 +692,6 @@ uint16x8_t test_vcaleq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcalt_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCALT_V2_I]]
//
@@ -775,8 +702,6 @@ uint16x4_t test_vcalt_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcaltq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCALTQ_V2_I]]
//
@@ -897,9 +822,7 @@ uint16x8_t test_vcltq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvt_n_f16_s16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x half> @llvm.aarch64.neon.vcvtfxs2fp.v4f16.v4i16(<4 x i16> [[VCVT_N]], i32 2)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x half> @llvm.aarch64.neon.vcvtfxs2fp.v4f16.v4i16(<4 x i16> [[A]], i32 2)
// CHECK-NEXT: ret <4 x half> [[VCVT_N1]]
//
float16x4_t test_vcvt_n_f16_s16(int16x4_t a) {
@@ -909,9 +832,7 @@ float16x4_t test_vcvt_n_f16_s16(int16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_n_f16_s16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x half> @llvm.aarch64.neon.vcvtfxs2fp.v8f16.v8i16(<8 x i16> [[VCVT_N]], i32 2)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x half> @llvm.aarch64.neon.vcvtfxs2fp.v8f16.v8i16(<8 x i16> [[A]], i32 2)
// CHECK-NEXT: ret <8 x half> [[VCVT_N1]]
//
float16x8_t test_vcvtq_n_f16_s16(int16x8_t a) {
@@ -921,9 +842,7 @@ float16x8_t test_vcvtq_n_f16_s16(int16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvt_n_f16_u16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x half> @llvm.aarch64.neon.vcvtfxu2fp.v4f16.v4i16(<4 x i16> [[VCVT_N]], i32 2)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x half> @llvm.aarch64.neon.vcvtfxu2fp.v4f16.v4i16(<4 x i16> [[A]], i32 2)
// CHECK-NEXT: ret <4 x half> [[VCVT_N1]]
//
float16x4_t test_vcvt_n_f16_u16(uint16x4_t a) {
@@ -933,9 +852,7 @@ float16x4_t test_vcvt_n_f16_u16(uint16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_n_f16_u16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x half> @llvm.aarch64.neon.vcvtfxu2fp.v8f16.v8i16(<8 x i16> [[VCVT_N]], i32 2)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x half> @llvm.aarch64.neon.vcvtfxu2fp.v8f16.v8i16(<8 x i16> [[A]], i32 2)
// CHECK-NEXT: ret <8 x half> [[VCVT_N1]]
//
float16x8_t test_vcvtq_n_f16_u16(uint16x8_t a) {
@@ -945,9 +862,7 @@ float16x8_t test_vcvtq_n_f16_u16(uint16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvt_n_s16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2fxs.v4i16.v4f16(<4 x half> [[VCVT_N]], i32 2)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2fxs.v4i16.v4f16(<4 x half> [[A]], i32 2)
// CHECK-NEXT: ret <4 x i16> [[VCVT_N1]]
//
int16x4_t test_vcvt_n_s16_f16(float16x4_t a) {
@@ -957,9 +872,7 @@ int16x4_t test_vcvt_n_s16_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_n_s16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.vcvtfp2fxs.v8i16.v8f16(<8 x half> [[VCVT_N]], i32 2)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.vcvtfp2fxs.v8i16.v8f16(<8 x half> [[A]], i32 2)
// CHECK-NEXT: ret <8 x i16> [[VCVT_N1]]
//
int16x8_t test_vcvtq_n_s16_f16(float16x8_t a) {
@@ -969,9 +882,7 @@ int16x8_t test_vcvtq_n_s16_f16(float16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvt_n_u16_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2fxu.v4i16.v4f16(<4 x half> [[VCVT_N]], i32 2)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2fxu.v4i16.v4f16(<4 x half> [[A]], i32 2)
// CHECK-NEXT: ret <4 x i16> [[VCVT_N1]]
//
uint16x4_t test_vcvt_n_u16_f16(float16x4_t a) {
@@ -981,9 +892,7 @@ uint16x4_t test_vcvt_n_u16_f16(float16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_n_u16_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.vcvtfp2fxu.v8i16.v8f16(<8 x half> [[VCVT_N]], i32 2)
+// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.vcvtfp2fxu.v8i16.v8f16(<8 x half> [[A]], i32 2)
// CHECK-NEXT: ret <8 x i16> [[VCVT_N1]]
//
uint16x8_t test_vcvtq_n_u16_f16(float16x8_t a) {
@@ -1013,8 +922,6 @@ float16x8_t test_vdivq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmax_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmax.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMAX2_I]]
//
@@ -1025,8 +932,6 @@ float16x4_t test_vmax_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmax.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMAX2_I]]
//
@@ -1037,8 +942,6 @@ float16x8_t test_vmaxq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxnm_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnm.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMAXNM2_I]]
//
@@ -1049,8 +952,6 @@ float16x4_t test_vmaxnm_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxnmq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnm.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMAXNM2_I]]
//
@@ -1061,8 +962,6 @@ float16x8_t test_vmaxnmq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmin_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmin.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMIN2_I]]
//
@@ -1073,8 +972,6 @@ float16x4_t test_vmin_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vminq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmin.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMIN2_I]]
//
@@ -1085,8 +982,6 @@ float16x8_t test_vminq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vminnm_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnm.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMINNM2_I]]
//
@@ -1097,8 +992,6 @@ float16x4_t test_vminnm_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vminnmq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnm.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMINNM2_I]]
//
@@ -1129,8 +1022,6 @@ float16x8_t test_vmulq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulx_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMULX2_I]]
//
@@ -1141,8 +1032,6 @@ float16x4_t test_vmulx_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vmulxq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMULX2_I]]
//
@@ -1153,10 +1042,7 @@ float16x8_t test_vmulxq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpadd_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
-// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <4 x half> [[VPADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x half> [[VPADD_V2_I]]
//
float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) {
@@ -1166,10 +1052,7 @@ float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpaddq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.faddp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
-// CHECK-NEXT: [[VPADDQ_V3_I:%.*]] = bitcast <8 x half> [[VPADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x half> [[VPADDQ_V2_I]]
//
float16x8_t test_vpaddq_f16(float16x8_t a, float16x8_t b) {
@@ -1179,8 +1062,6 @@ float16x8_t test_vpaddq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpmax_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VPMAX2_I]]
//
@@ -1191,8 +1072,6 @@ float16x4_t test_vpmax_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpmaxq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VPMAX2_I]]
//
@@ -1203,8 +1082,6 @@ float16x8_t test_vpmaxq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpmaxnm_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnmp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VPMAXNM2_I]]
//
@@ -1215,8 +1092,6 @@ float16x4_t test_vpmaxnm_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpmaxnmq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnmp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VPMAXNM2_I]]
//
@@ -1227,8 +1102,6 @@ float16x8_t test_vpmaxnmq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpmin_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VPMIN2_I]]
//
@@ -1239,8 +1112,6 @@ float16x4_t test_vpmin_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpminq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VPMIN2_I]]
//
@@ -1251,8 +1122,6 @@ float16x8_t test_vpminq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpminnm_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnmp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VPMINNM2_I]]
//
@@ -1263,8 +1132,6 @@ float16x4_t test_vpminnm_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vpminnmq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnmp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VPMINNM2_I]]
//
@@ -1275,10 +1142,7 @@ float16x8_t test_vpminnmq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vrecps_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecps.v4f16(<4 x half> [[A]], <4 x half> [[B]])
-// CHECK-NEXT: [[VRECPS_V3_I:%.*]] = bitcast <4 x half> [[VRECPS_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x half> [[VRECPS_V2_I]]
//
float16x4_t test_vrecps_f16(float16x4_t a, float16x4_t b) {
@@ -1288,10 +1152,7 @@ float16x4_t test_vrecps_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vrecpsq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecps.v8f16(<8 x half> [[A]], <8 x half> [[B]])
-// CHECK-NEXT: [[VRECPSQ_V3_I:%.*]] = bitcast <8 x half> [[VRECPSQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x half> [[VRECPSQ_V2_I]]
//
float16x8_t test_vrecpsq_f16(float16x8_t a, float16x8_t b) {
@@ -1301,10 +1162,7 @@ float16x8_t test_vrecpsq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vrsqrts_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrts.v4f16(<4 x half> [[A]], <4 x half> [[B]])
-// CHECK-NEXT: [[VRSQRTS_V3_I:%.*]] = bitcast <4 x half> [[VRSQRTS_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x half> [[VRSQRTS_V2_I]]
//
float16x4_t test_vrsqrts_f16(float16x4_t a, float16x4_t b) {
@@ -1314,10 +1172,7 @@ float16x4_t test_vrsqrts_f16(float16x4_t a, float16x4_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vrsqrtsq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> [[A]], <8 x half> [[B]])
-// CHECK-NEXT: [[VRSQRTSQ_V3_I:%.*]] = bitcast <8 x half> [[VRSQRTSQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x half> [[VRSQRTSQ_V2_I]]
//
float16x8_t test_vrsqrtsq_f16(float16x8_t a, float16x8_t b) {
@@ -1347,11 +1202,8 @@ float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vfma_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[C]], <4 x half> [[A]])
-// CHECK-NEXT: ret <4 x half> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[C]], <4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
//
float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
  return vfma_f16(a, b, c);
@@ -1360,11 +1212,8 @@ float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[C]], <8 x half> [[A]])
-// CHECK-NEXT: ret <8 x half> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[C]], <8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
  return vfmaq_f16(a, b, c);
@@ -1374,11 +1223,8 @@ float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x half> [[B]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[C]], <4 x half> [[A]])
-// CHECK-NEXT: ret <4 x half> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[C]], <4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
//
float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
  return vfms_f16(a, b, c);
@@ -1388,11 +1234,8 @@ float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <8 x half> [[B]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[C]], <8 x half> [[A]])
-// CHECK-NEXT: ret <8 x half> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[C]], <8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
  return vfmsq_f16(a, b, c);
@@ -1401,14 +1244,8 @@ float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfma_lane_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FMLA]], <4 x half> [[LANE]], <4 x half> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <4 x i32>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[LANE]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[FMLA2]]
//
float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
@@ -1418,14 +1255,8 @@ float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_lane_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FMLA]], <8 x half> [[LANE]], <8 x half> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <8 x i32>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[LANE]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[FMLA2]]
//
float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
@@ -1435,15 +1266,9 @@ float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfma_laneq_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32>
-// CHECK-NEXT: [[TMP6:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]])
-// CHECK-NEXT: ret <4 x half> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[B]], <4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
//
float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
  return vfma_laneq_f16(a, b, c, 7);
@@ -1452,15 +1277,9 @@ float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_laneq_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32>
-// CHECK-NEXT: [[TMP6:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]])
-// CHECK-NEXT: ret <8 x half> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <8 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[B]], <8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
  return vfmaq_laneq_f16(a, b, c, 7);
@@ -1469,15 +1288,10 @@ float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfma_n_f16
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[C]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[VECINIT3]], <4 x half> [[A]])
-// CHECK-NEXT: ret <4 x half> [[TMP3]]
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[VECINIT3]], <4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
//
float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
  return vfma_n_f16(a, b, c);
@@ -1486,19 +1300,10 @@ float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_n_f16
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[C]], i32 1
-// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[C]], i32 2
-// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[C]], i32 3
-// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[C]], i32 4
-// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[C]], i32 5
-// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[C]], i32 6
-// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[C]], i32 7
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[VECINIT7]], <8 x half> [[A]])
-// CHECK-NEXT: ret <8 x half> [[TMP3]]
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i64 0
+// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[VECINIT7]], <8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
  return vfmaq_n_f16(a, b, c);
@@ -1507,7 +1312,7 @@ float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfmah_lane_f16
// CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i32 3
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i64 3
// CHECK-NEXT: [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half [[EXTRACT]], half [[A]])
// CHECK-NEXT: ret half [[TMP0]]
//
@@ -1518,7 +1323,7 @@ float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vfmah_laneq_f16
// CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i32 7
+// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i64 7
// CHECK-NEXT: [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half [[EXTRACT]], half [[A]])
// CHECK-NEXT: ret half [[TMP0]]
//
@@ -1530,14 +1335,8 @@ float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FMLA]], <4 x half> [[LANE]], <4 x half> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <4 x i32>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[LANE]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[FMLA2]]
//
float16x4_t test_vfms_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
@@ -1548,14 +1347,8 @@ float16x4_t test_vfms_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32>
-// CHECK-NEXT: [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FMLA]], <8 x half> [[LANE]], <8 x half> [[FMLA1]])
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[C]], <4 x half> poison, <8 x i32>
+// CHECK-NEXT: [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[LANE]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[FMLA2]]
//
float16x8_t test_vfmsq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
@@ -1566,15 +1359,9 @@ float16x8_t test_vfmsq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32>
-// CHECK-NEXT: [[TMP6:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]])
-// CHECK-NEXT: ret <4 x half> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <4 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[FNEG]], <4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
//
float16x4_t test_vfms_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
  return vfms_laneq_f16(a, b, c, 7);
@@ -1584,15 +1371,9 @@ float16x4_t test_vfms_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32>
-// CHECK-NEXT: [[TMP6:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]])
-// CHECK-NEXT: ret <8 x half> [[TMP6]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[C]], <8 x half> poison, <8 x i32>
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[FNEG]], <8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
//
float16x8_t test_vfmsq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
  return vfmsq_laneq_f16(a, b, c, 7);
@@ -1602,15 +1383,10 @@ float16x8_t test_vfmsq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FNEG:%.*]] = fneg <4 x half> [[B]]
-// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i32 0
-// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[C]], i32 1
-//
CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[C]], i32 3 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[VECINIT3]], <4 x half> [[A]]) -// CHECK-NEXT: ret <4 x half> [[TMP3]] +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[C]], i64 0 +// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[VECINIT3]], <4 x half> [[A]]) +// CHECK-NEXT: ret <4 x half> [[TMP0]] // float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) { return vfms_n_f16(a, b, c); @@ -1620,19 +1396,10 @@ float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) { // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[FNEG:%.*]] = fneg <8 x half> [[B]] -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i32 0 -// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[C]], i32 1 -// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[C]], i32 2 -// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[C]], i32 3 -// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[C]], i32 4 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[C]], i32 5 -// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[C]], i32 6 -// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[C]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[VECINIT7]], <8 x half> [[A]]) -// CHECK-NEXT: ret <8 x half> [[TMP3]] +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[C]], i64 0 +// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[VECINIT7]], <8 x half> [[A]]) +// CHECK-NEXT: ret <8 x half> [[TMP0]] // float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) { return vfmsq_n_f16(a, b, c); @@ -1641,10 +1408,8 @@ float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) { // CHECK-LABEL: define {{[^@]+}}@test_vfmsh_lane_f16 // CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[CONV:%.*]] = fpext half [[B]] to float -// CHECK-NEXT: [[FNEG:%.*]] = fneg float [[CONV]] -// CHECK-NEXT: [[TMP0:%.*]] = fptrunc float [[FNEG]] to half -// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i32 3 +// CHECK-NEXT: [[TMP0:%.*]] = fneg half [[B]] +// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i64 3 // 
CHECK-NEXT: [[TMP1:%.*]] = call half @llvm.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]]) // CHECK-NEXT: ret half [[TMP1]] // @@ -1655,10 +1420,8 @@ float16_t test_vfmsh_lane_f16(float16_t a, float16_t b, float16x4_t c) { // CHECK-LABEL: define {{[^@]+}}@test_vfmsh_laneq_f16 // CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[CONV:%.*]] = fpext half [[B]] to float -// CHECK-NEXT: [[FNEG:%.*]] = fneg float [[CONV]] -// CHECK-NEXT: [[TMP0:%.*]] = fptrunc float [[FNEG]] to half -// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i32 7 +// CHECK-NEXT: [[TMP0:%.*]] = fneg half [[B]] +// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i64 7 // CHECK-NEXT: [[TMP1:%.*]] = call half @llvm.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]]) // CHECK-NEXT: ret half [[TMP1]] // @@ -1669,9 +1432,7 @@ float16_t test_vfmsh_laneq_f16(float16_t a, float16_t b, float16x8_t c) { // CHECK-LABEL: define {{[^@]+}}@test_vmul_lane_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <4 x i32> // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x half> [[A]], [[LANE]] // CHECK-NEXT: ret <4 x half> [[MUL]] // @@ -1682,9 +1443,7 @@ float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulq_lane_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <8 x i32> // CHECK-NEXT: [[MUL:%.*]] = fmul <8 x half> [[A]], [[LANE]] // CHECK-NEXT: ret <8 x half> [[MUL]] // @@ -1695,9 +1454,7 @@ float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmul_laneq_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <4 x i32> // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x half> [[A]], [[LANE]] // CHECK-NEXT: ret <4 x half> [[MUL]] // @@ -1708,9 +1465,7 @@ float16x4_t test_vmul_laneq_f16(float16x4_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulq_laneq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <8 x i32> // CHECK-NEXT: 
[[MUL:%.*]] = fmul <8 x half> [[A]], [[LANE]] // CHECK-NEXT: ret <8 x half> [[MUL]] // @@ -1721,10 +1476,8 @@ float16x8_t test_vmulq_laneq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmul_n_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[B]], i32 0 -// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[B]], i32 1 -// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[B]], i32 2 -// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[B]], i32 3 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[B]], i64 0 +// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = fmul <4 x half> [[A]], [[VECINIT3]] // CHECK-NEXT: ret <4 x half> [[MUL]] // @@ -1735,14 +1488,8 @@ float16x4_t test_vmul_n_f16(float16x4_t a, float16_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulq_n_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[B]], i32 0 -// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[B]], i32 1 -// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[B]], i32 2 -// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[B]], i32 3 -// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[B]], i32 4 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[B]], i32 5 -// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[B]], i32 6 -// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[B]], i32 7 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[B]], i64 0 +// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer // CHECK-NEXT: [[MUL:%.*]] = fmul <8 x half> [[A]], [[VECINIT7]] // CHECK-NEXT: ret <8 x half> [[MUL]] // @@ -1754,18 +1501,9 @@ float16x8_t test_vmulq_n_f16(float16x8_t a, float16_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulh_lane_f16 // CHECK-SAME: (half noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_847:%.*]] = alloca <4 x half>, align 8 -// CHECK-NEXT: [[__REINT1_847:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[CONV:%.*]] = fpext half [[A]] to float -// CHECK-NEXT: store <4 x half> [[B]], ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_847]], align 8 -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP0]], i32 3 -// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_847]], align 2 -// CHECK-NEXT: [[CONV2:%.*]] = fpext half [[TMP1]] to float -// CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[CONV2]] -// CHECK-NEXT: [[TMP2:%.*]] = fptrunc float [[MUL]] to half -// CHECK-NEXT: ret half [[TMP2]] +// CHECK-NEXT: [[DOTCAST1:%.*]] = extractelement <4 x half> [[B]], i64 3 +// CHECK-NEXT: [[TMP0:%.*]] = fmul half [[A]], [[DOTCAST1]] +// CHECK-NEXT: ret half [[TMP0]] // float16_t test_vmulh_lane_f16(float16_t a, float16x4_t b) { return vmulh_lane_f16(a, b, 3); @@ -1774,18 
+1512,9 @@ float16_t test_vmulh_lane_f16(float16_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulh_laneq_f16 // CHECK-SAME: (half noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[__REINT_850:%.*]] = alloca <8 x half>, align 16 -// CHECK-NEXT: [[__REINT1_850:%.*]] = alloca i16, align 2 -// CHECK-NEXT: [[CONV:%.*]] = fpext half [[A]] to float -// CHECK-NEXT: store <8 x half> [[B]], ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[__REINT_850]], align 16 -// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7 -// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[__REINT1_850]], align 2 -// CHECK-NEXT: [[CONV2:%.*]] = fpext half [[TMP1]] to float -// CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[CONV2]] -// CHECK-NEXT: [[TMP2:%.*]] = fptrunc float [[MUL]] to half -// CHECK-NEXT: ret half [[TMP2]] +// CHECK-NEXT: [[DOTCAST1:%.*]] = extractelement <8 x half> [[B]], i64 7 +// CHECK-NEXT: [[TMP0:%.*]] = fmul half [[A]], [[DOTCAST1]] +// CHECK-NEXT: ret half [[TMP0]] // float16_t test_vmulh_laneq_f16(float16_t a, float16x8_t b) { return vmulh_laneq_f16(a, b, 7); @@ -1794,11 +1523,7 @@ float16_t test_vmulh_laneq_f16(float16_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulx_lane_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <4 x i32> // CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) // CHECK-NEXT: ret <4 x half> [[VMULX2_I]] // @@ -1809,11 +1534,7 @@ float16x4_t test_vmulx_lane_f16(float16x4_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulxq_lane_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <8 x i32> // CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) // CHECK-NEXT: ret <8 x half> [[VMULX2_I]] // @@ -1824,11 +1545,7 @@ float16x8_t test_vmulxq_lane_f16(float16x8_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulx_laneq_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = 
bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <4 x i32> // CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) // CHECK-NEXT: ret <4 x half> [[VMULX2_I]] // @@ -1839,11 +1556,7 @@ float16x4_t test_vmulx_laneq_f16(float16x4_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulxq_laneq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[B]], <8 x half> poison, <8 x i32> // CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) // CHECK-NEXT: ret <8 x half> [[VMULX2_I]] // @@ -1854,12 +1567,8 @@ float16x8_t test_vmulxq_laneq_f16(float16x8_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulx_n_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[B]], i32 0 -// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[B]], i32 1 -// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[B]], i32 2 -// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[B]], i32 3 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8> +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[B]], i64 0 +// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer // CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[VECINIT3]]) // CHECK-NEXT: ret <4 x half> [[VMULX2_I]] // @@ -1870,16 +1579,8 @@ float16x4_t test_vmulx_n_f16(float16x4_t a, float16_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulxq_n_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[B]], i32 0 -// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[B]], i32 1 -// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[B]], i32 2 -// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[B]], i32 3 -// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[B]], i32 4 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[B]], i32 5 -// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[B]], i32 6 -// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[B]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8> +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, 
half [[B]], i64 0 +// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer // CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[VECINIT7]]) // CHECK-NEXT: ret <8 x half> [[VMULX2_I]] // @@ -1890,7 +1591,7 @@ float16x8_t test_vmulxq_n_f16(float16x8_t a, float16_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulxh_lane_f16 // CHECK-SAME: (half noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[B]], i32 3 +// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x half> [[B]], i64 3 // CHECK-NEXT: [[VMULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half [[A]], half [[EXTRACT]]) // CHECK-NEXT: ret half [[VMULX]] // @@ -1901,7 +1602,7 @@ float16_t test_vmulxh_lane_f16(float16_t a, float16x4_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmulxh_laneq_f16 // CHECK-SAME: (half noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[B]], i32 7 +// CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <8 x half> [[B]], i64 7 // CHECK-NEXT: [[VMULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half [[A]], half [[EXTRACT]]) // CHECK-NEXT: ret half [[VMULX]] // @@ -1912,9 +1613,7 @@ float16_t test_vmulxh_laneq_f16(float16_t a, float16x8_t b) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxv_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[VMAXV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-NEXT: [[VMAXV1:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v4f16(<4 x half> [[VMAXV]]) +// CHECK-NEXT: [[VMAXV1:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v4f16(<4 x half> [[A]]) // CHECK-NEXT: ret half [[VMAXV1]] // float16_t test_vmaxv_f16(float16x4_t a) { @@ -1924,9 +1623,7 @@ float16_t test_vmaxv_f16(float16x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[VMAXV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-NEXT: [[VMAXV1:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v8f16(<8 x half> [[VMAXV]]) +// CHECK-NEXT: [[VMAXV1:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v8f16(<8 x half> [[A]]) // CHECK-NEXT: ret half [[VMAXV1]] // float16_t test_vmaxvq_f16(float16x8_t a) { @@ -1936,9 +1633,7 @@ float16_t test_vmaxvq_f16(float16x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminv_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[VMINV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-NEXT: [[VMINV1:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v4f16(<4 x half> [[VMINV]]) +// CHECK-NEXT: [[VMINV1:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v4f16(<4 x half> [[A]]) // CHECK-NEXT: ret half [[VMINV1]] // float16_t test_vminv_f16(float16x4_t a) { @@ -1948,9 +1643,7 @@ float16_t test_vminv_f16(float16x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminvq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[VMINV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// 
CHECK-NEXT: [[VMINV1:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v8f16(<8 x half> [[VMINV]]) +// CHECK-NEXT: [[VMINV1:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v8f16(<8 x half> [[A]]) // CHECK-NEXT: ret half [[VMINV1]] // float16_t test_vminvq_f16(float16x8_t a) { @@ -1960,9 +1653,7 @@ float16_t test_vminvq_f16(float16x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxnmv_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[VMAXNMV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-NEXT: [[VMAXNMV1:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v4f16(<4 x half> [[VMAXNMV]]) +// CHECK-NEXT: [[VMAXNMV1:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v4f16(<4 x half> [[A]]) // CHECK-NEXT: ret half [[VMAXNMV1]] // float16_t test_vmaxnmv_f16(float16x4_t a) { @@ -1972,9 +1663,7 @@ float16_t test_vmaxnmv_f16(float16x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vmaxnmvq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[VMAXNMV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-NEXT: [[VMAXNMV1:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v8f16(<8 x half> [[VMAXNMV]]) +// CHECK-NEXT: [[VMAXNMV1:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v8f16(<8 x half> [[A]]) // CHECK-NEXT: ret half [[VMAXNMV1]] // float16_t test_vmaxnmvq_f16(float16x8_t a) { @@ -1984,9 +1673,7 @@ float16_t test_vmaxnmvq_f16(float16x8_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminnmv_f16 // CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> -// CHECK-NEXT: [[VMINNMV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK-NEXT: [[VMINNMV1:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v4f16(<4 x half> [[VMINNMV]]) +// CHECK-NEXT: [[VMINNMV1:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v4f16(<4 x half> [[A]]) // CHECK-NEXT: ret half [[VMINNMV1]] // float16_t test_vminnmv_f16(float16x4_t a) { @@ -1996,9 +1683,7 @@ float16_t test_vminnmv_f16(float16x4_t a) { // CHECK-LABEL: define {{[^@]+}}@test_vminnmvq_f16 // CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> -// CHECK-NEXT: [[VMINNMV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> -// CHECK-NEXT: [[VMINNMV1:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v8f16(<8 x half> [[VMINNMV]]) +// CHECK-NEXT: [[VMINNMV1:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v8f16(<8 x half> [[A]]) // CHECK-NEXT: ret half [[VMINNMV1]] // float16_t test_vminnmvq_f16(float16x8_t a) { diff --git a/clang/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.c b/clang/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.c index c44dd333c9754..6ddf298d0dc6b 100644 --- a/clang/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.c +++ b/clang/test/CodeGen/AArch64/v8.5a-neon-frint3264-intrinsic.c @@ -1,120 +1,169 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v8.5a\ // RUN: -flax-vector-conversions=none -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg \ +// RUN: | opt -S -passes=mem2reg,instcombine \ // RUN: | FileCheck %s // REQUIRES: aarch64-registered-target #include 
-// CHECK-LABEL: test_vrnd32x_f32 -// CHECK: [[RND:%.*]] = call <2 x float> @llvm.aarch64.neon.frint32x.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[RND]] +// CHECK-LABEL: define dso_local <2 x float> @test_vrnd32x_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND32X_F321_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frint32x.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret <2 x float> [[VRND32X_F321_I]] +// float32x2_t test_vrnd32x_f32(float32x2_t a) { return vrnd32x_f32(a); } -// CHECK-LABEL: test_vrnd32xq_f32 -// CHECK: [[RND:%.*]] = call <4 x float> @llvm.aarch64.neon.frint32x.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[RND]] +// CHECK-LABEL: define dso_local <4 x float> @test_vrnd32xq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND32XQ_F321_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frint32x.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: ret <4 x float> [[VRND32XQ_F321_I]] +// float32x4_t test_vrnd32xq_f32(float32x4_t a) { return vrnd32xq_f32(a); } -// CHECK-LABEL: test_vrnd32z_f32 -// CHECK: [[RND:%.*]] = call <2 x float> @llvm.aarch64.neon.frint32z.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[RND]] +// CHECK-LABEL: define dso_local <2 x float> @test_vrnd32z_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND32Z_F321_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frint32z.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret <2 x float> [[VRND32Z_F321_I]] +// float32x2_t test_vrnd32z_f32(float32x2_t a) { return vrnd32z_f32(a); } -// CHECK-LABEL: test_vrnd32zq_f32 -// CHECK: [[RND:%.*]] = call <4 x float> @llvm.aarch64.neon.frint32z.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[RND]] +// CHECK-LABEL: define dso_local <4 x float> @test_vrnd32zq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND32ZQ_F321_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frint32z.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: ret <4 x float> [[VRND32ZQ_F321_I]] +// float32x4_t test_vrnd32zq_f32(float32x4_t a) { return vrnd32zq_f32(a); } -// CHECK-LABEL: test_vrnd64x_f32 -// CHECK: [[RND:%.*]] = call <2 x float> @llvm.aarch64.neon.frint64x.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[RND]] +// CHECK-LABEL: define dso_local <2 x float> @test_vrnd64x_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND64X_F321_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frint64x.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret <2 x float> [[VRND64X_F321_I]] +// float32x2_t test_vrnd64x_f32(float32x2_t a) { return vrnd64x_f32(a); } -// CHECK-LABEL: test_vrnd64xq_f32 -// CHECK: [[RND:%.*]] = call <4 x float> @llvm.aarch64.neon.frint64x.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[RND]] +// CHECK-LABEL: define dso_local <4 x float> @test_vrnd64xq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND64XQ_F321_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frint64x.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: ret <4 x float> [[VRND64XQ_F321_I]] +// float32x4_t test_vrnd64xq_f32(float32x4_t a) { return vrnd64xq_f32(a); } -// CHECK-LABEL: test_vrnd64z_f32 -// CHECK: [[RND:%.*]] = call <2 x float> @llvm.aarch64.neon.frint64z.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[RND]] +// CHECK-LABEL: define dso_local <2 x float> 
@test_vrnd64z_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND64Z_F321_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frint64z.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret <2 x float> [[VRND64Z_F321_I]] +// float32x2_t test_vrnd64z_f32(float32x2_t a) { return vrnd64z_f32(a); } -// CHECK-LABEL: test_vrnd64zq_f32 -// CHECK: [[RND:%.*]] = call <4 x float> @llvm.aarch64.neon.frint64z.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[RND]] +// CHECK-LABEL: define dso_local <4 x float> @test_vrnd64zq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND64ZQ_F321_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frint64z.v4f32(<4 x float> [[A]]) +// CHECK-NEXT: ret <4 x float> [[VRND64ZQ_F321_I]] +// float32x4_t test_vrnd64zq_f32(float32x4_t a) { return vrnd64zq_f32(a); } -// CHECK-LABEL: test_vrnd32x_f64 -// CHECK: [[RND:%.*]] = call <1 x double> @llvm.aarch64.neon.frint32x.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[RND]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrnd32x_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND32X_F641_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frint32x.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x double> [[VRND32X_F641_I]] +// float64x1_t test_vrnd32x_f64(float64x1_t a) { return vrnd32x_f64(a); } -// CHECK-LABEL: test_vrnd32xq_f64 -// CHECK: [[RND:%.*]] = call <2 x double> @llvm.aarch64.neon.frint32x.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[RND]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrnd32xq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND32XQ_F641_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frint32x.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret <2 x double> [[VRND32XQ_F641_I]] +// float64x2_t test_vrnd32xq_f64(float64x2_t a) { return vrnd32xq_f64(a); } -// CHECK-LABEL: test_vrnd32z_f64 -// CHECK: [[RND:%.*]] = call <1 x double> @llvm.aarch64.neon.frint32z.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[RND]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrnd32z_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND32Z_F641_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frint32z.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x double> [[VRND32Z_F641_I]] +// float64x1_t test_vrnd32z_f64(float64x1_t a) { return vrnd32z_f64(a); } -// CHECK-LABEL: test_vrnd32zq_f64 -// CHECK: [[RND:%.*]] = call <2 x double> @llvm.aarch64.neon.frint32z.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[RND]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrnd32zq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND32ZQ_F641_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frint32z.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret <2 x double> [[VRND32ZQ_F641_I]] +// float64x2_t test_vrnd32zq_f64(float64x2_t a) { return vrnd32zq_f64(a); } -// CHECK-LABEL: test_vrnd64x_f64 -// CHECK: [[RND:%.*]] = call <1 x double> @llvm.aarch64.neon.frint64x.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[RND]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrnd64x_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND64X_F641_I:%.*]] = call <1 x double> 
@llvm.aarch64.neon.frint64x.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x double> [[VRND64X_F641_I]] +// float64x1_t test_vrnd64x_f64(float64x1_t a) { return vrnd64x_f64(a); } -// CHECK-LABEL: test_vrnd64xq_f64 -// CHECK: [[RND:%.*]] = call <2 x double> @llvm.aarch64.neon.frint64x.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[RND]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrnd64xq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND64XQ_F641_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frint64x.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret <2 x double> [[VRND64XQ_F641_I]] +// float64x2_t test_vrnd64xq_f64(float64x2_t a) { return vrnd64xq_f64(a); } -// CHECK-LABEL: test_vrnd64z_f64 -// CHECK: [[RND:%.*]] = call <1 x double> @llvm.aarch64.neon.frint64z.v1f64(<1 x double> %a) -// CHECK: ret <1 x double> [[RND]] +// CHECK-LABEL: define dso_local <1 x double> @test_vrnd64z_f64( +// CHECK-SAME: <1 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND64Z_F641_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frint64z.v1f64(<1 x double> [[A]]) +// CHECK-NEXT: ret <1 x double> [[VRND64Z_F641_I]] +// float64x1_t test_vrnd64z_f64(float64x1_t a) { return vrnd64z_f64(a); } -// CHECK-LABEL: test_vrnd64zq_f64 -// CHECK: [[RND:%.*]] = call <2 x double> @llvm.aarch64.neon.frint64z.v2f64(<2 x double> %a) -// CHECK: ret <2 x double> [[RND]] +// CHECK-LABEL: define dso_local <2 x double> @test_vrnd64zq_f64( +// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND64ZQ_F641_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frint64z.v2f64(<2 x double> [[A]]) +// CHECK-NEXT: ret <2 x double> [[VRND64ZQ_F641_I]] +// float64x2_t test_vrnd64zq_f64(float64x2_t a) { return vrnd64zq_f64(a); } diff --git a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c index 7bfeb7939edb2..8bfe2cffff58d 100644 --- a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c +++ b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c @@ -1,147 +1,163 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \ // RUN: -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg,sroa \ +// RUN: | opt -S -passes=mem2reg,instcombine,sroa \ // RUN: | FileCheck %s // REQUIRES: aarch64-registered-target #include -// CHECK-LABEL: test_vmmlaq_s32 -// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) -// CHECK: ret <4 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmmlaq_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMMLA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMMLA1_I]] +// int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) { return vmmlaq_s32(r, a, b); } -// CHECK-LABEL: test_vmmlaq_u32 -// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) -// CHECK: ret <4 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <4 x i32> 
@test_vmmlaq_u32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMMLA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMMLA1_I]] +// uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) { return vmmlaq_u32(r, a, b); } -// CHECK-LABEL: test_vusmmlaq_s32 -// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) -// CHECK: ret <4 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vusmmlaq_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUSMMLA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VUSMMLA1_I]] +// int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { return vusmmlaq_s32(r, a, b); } -// CHECK-LABEL: test_vusdot_s32 -// CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) -// CHECK: ret <2 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_s32( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[R]], <8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]] +// int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { return vusdot_s32(r, a, b); } -// CHECK-LABEL: test_vusdot_lane_s32 -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> -// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]]) -// CHECK: ret <2 x i32> [[OP]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_lane_s32( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[DOTCAST]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[R]], <8 x i8> [[A]], <8 x i8> [[TMP0]]) +// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]] +// int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { return vusdot_lane_s32(r, a, b, 0); } -// CHECK-LABEL: test_vsudot_lane_s32 -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %0 to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> %1 to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to 
<8 x i8> -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> -// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a) -// CHECK: ret <2 x i32> [[OP]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsudot_lane_s32( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[DOTCAST]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[R]], <8 x i8> [[TMP0]], <8 x i8> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]] +// int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) { return vsudot_lane_s32(r, a, b, 0); } -// CHECK-LABEL: test_vusdot_laneq_s32 -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> -// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]]) -// CHECK: ret <2 x i32> [[OP]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_laneq_s32( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[B]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[DOTCAST]], <4 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[R]], <8 x i8> [[A]], <8 x i8> [[TMP0]]) +// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]] +// int32x2_t test_vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b) { return vusdot_laneq_s32(r, a, b, 0); } -// CHECK-LABEL: test_vsudot_laneq_s32 -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> -// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a) -// CHECK: ret <2 x i32> [[OP]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsudot_laneq_s32( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[B]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[DOTCAST]], <4 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> [[R]], <8 x i8> 
[[TMP0]], <8 x i8> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]] +// int32x2_t test_vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b) { return vsudot_laneq_s32(r, a, b, 0); } -// CHECK-LABEL: test_vusdotq_s32 -// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) -// CHECK: ret <4 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vusdotq_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]] +// int32x4_t test_vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { return vusdotq_s32(r, a, b); } -// CHECK-LABEL: test_vusdotq_lane_s32 -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> -// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]]) -// CHECK: ret <4 x i32> [[OP]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vusdotq_lane_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[DOTCAST]], <2 x i32> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[TMP0]]) +// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]] +// int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) { return vusdotq_lane_s32(r, a, b, 0); } -// CHECK-LABEL: test_vsudotq_lane_s32 -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> -// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a) -// CHECK: ret <4 x i32> [[OP]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsudotq_lane_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[DOTCAST]], <2 x i32> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[TMP0]], <16 x i8> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]] +// int32x4_t 
test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) { return vsudotq_lane_s32(r, a, b, 0); }
-// CHECK-LABEL: test_vusdotq_laneq_s32
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
-// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]])
-// CHECK: ret <4 x i32> [[OP]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vusdotq_laneq_s32(
+// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[B]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[DOTCAST]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[TMP0]])
+// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]]
+//
 int32x4_t test_vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
   return vusdotq_laneq_s32(r, a, b, 0);
 }
-// CHECK-LABEL: test_vsudotq_laneq_s32
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8>
-// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a)
-// CHECK: ret <4 x i32> [[OP]]
+// CHECK-LABEL: define dso_local <4 x i32> @test_vsudotq_laneq_s32(
+// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <16 x i8> [[B]] to <4 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[DOTCAST]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
+// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[TMP0]], <16 x i8> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]]
+//
 int32x4_t test_vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b) {
   return vsudotq_laneq_s32(r, a, b, 0);
 }
diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
index 51aa5aa758f0c..e6e486efbbb69 100644
--- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
@@ -2,19 +2,19 @@
 // RUN: %clang_cc1 \
 // RUN: -triple aarch64 -target-feature +neon -target-feature +bf16 \
 // RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,instcombine \
 // RUN: | FileCheck --check-prefixes=CHECK,CHECK-A64 %s
 // RUN: %clang_cc1 \
 // RUN: -triple armv8.6a-arm-none-eabi -target-feature +neon \
 // RUN: -target-feature +bf16 -mfloat-abi hard \
 // RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,instcombine \
 // RUN: | FileCheck --check-prefixes=CHECK,CHECK-A32-HARDFP %s
 // RUN: %clang_cc1 \
 // RUN: -triple armv8.6a-arm-none-eabi -target-feature +neon \
 // RUN: -target-feature +bf16 -mfloat-abi softfp \
 // RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,instcombine \
 // RUN: | FileCheck --check-prefixes=CHECK,CHECK-A32-SOFTFP %s
 // REQUIRES: arm-registered-target
@@ -24,51 +24,27 @@
 // CHECK-A64-LABEL: @test_vcvt_f32_bf16(
 // CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[__REINT_808_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A64-NEXT: [[__REINT1_808_I:%.*]] = alloca <4 x i32>, align 16
-// CHECK-A64-NEXT: store <4 x bfloat> [[A:%.*]], ptr [[__REINT_808_I]], align 8
-// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I]], align 8
-// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_808_I]], align 16
-// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I]], align 16
-// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A64-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A64-NEXT: [[VSHLL_N_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A64-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A64-NEXT: ret <4 x float> [[DOTCAST1]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvt_f32_bf16(
 // CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[__REINT_808_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-HARDFP-NEXT: [[__REINT1_808_I:%.*]] = alloca <4 x i32>, align 8
-// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[A:%.*]], ptr [[__REINT_808_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_808_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I]], align 8
-// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A32-HARDFP-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <4 x i16>
+// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[DOTCAST1]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvt_f32_bf16(
 // CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[__P0_808_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT_808_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT1_808_I:%.*]] = alloca <4 x i32>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[A:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[A1:%.*]] = load <4 x bfloat>, ptr [[A]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[A1]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP0]], ptr [[__P0_808_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_8081_I:%.*]] = load <4 x bfloat>, ptr [[__P0_808_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8081_I]], ptr [[__REINT_808_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_808_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl <4 x i32> [[TMP3]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I]], ptr [[__REINT1_808_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[__REINT1_808_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP4]]
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST:%.*]] = bitcast <2 x i32> [[A_COERCE:%.*]] to <4 x i16>
+// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[DOTCAST1]]
 //
 float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
   return vcvt_f32_bf16(a);
@@ -76,72 +52,31 @@ float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
 // CHECK-A64-LABEL: @test_vcvtq_low_f32_bf16(
 // CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A64-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 16
-// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A64-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8
-// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
-// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 16
-// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 16
-// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A64-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A64-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A64-NEXT: ret <4 x float> [[DOTCAST1]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_f32_bf16(
 // CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-HARDFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8
-// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A32-HARDFP-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
+// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[DOTCAST1]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_f32_bf16(
 // CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_I2:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE3_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[A:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[A1:%.*]] = load <8 x bfloat>, ptr [[A]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[A1]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP0]], ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <8 x bfloat>, ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[__P01_I]], ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP1]], ptr [[__P0_I2]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P01_I5:%.*]] = load <8 x bfloat>, ptr [[__P0_I2]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I5]], <8 x bfloat> [[__P01_I5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP2]], ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP3]], ptr [[COERCE3_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[COERCE3_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_8081_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8081_I_I]], ptr [[__REINT_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP7]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP8]]
+// CHECK-A32-SOFTFP-NEXT: [[__P01_I5_CAST:%.*]] = bitcast <4 x i32> [[A_COERCE:%.*]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I5_CAST]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
+// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[DOTCAST1]]
 //
 float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
   return vcvtq_low_f32_bf16(a);
@@ -149,72 +84,31 @@ float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
 // CHECK-A64-LABEL: @test_vcvtq_high_f32_bf16(
 // CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A64-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 16
-// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-A64-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8
-// CHECK-A64-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
-// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A64-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A64-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 16
-// CHECK-A64-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 16
-// CHECK-A64-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-A64-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
+// CHECK-A64-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A64-NEXT: [[VSHLL_N_I_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A64-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A64-NEXT: ret <4 x float> [[DOTCAST1]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_f32_bf16(
 // CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-HARDFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8
-// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-HARDFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[__REINT_808_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
-// CHECK-A32-HARDFP-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP0]] to <4 x i32>
-// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 16)
-// CHECK-A32-HARDFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8
-// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-HARDFP-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
+// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A32-HARDFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A32-HARDFP-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-HARDFP-NEXT: ret <4 x float> [[DOTCAST1]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_f32_bf16(
 // CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_I2:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT_808_I_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__REINT1_808_I_I:%.*]] = alloca <4 x i32>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE3_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[A:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[A_COERCE:%.*]], ptr [[A]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[A1:%.*]] = load <8 x bfloat>, ptr [[A]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[A1]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP0]], ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <8 x bfloat>, ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[__P01_I]], ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP1]], ptr [[__P0_I2]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P01_I5:%.*]] = load <8 x bfloat>, ptr [[__P0_I2]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I5]], <8 x bfloat> [[__P01_I5]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP2]], ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP3]], ptr [[COERCE3_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[COERCE3_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_8081_I_I:%.*]] = load <4 x bfloat>, ptr [[__P0_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[__P0_8081_I_I]], ptr [[__REINT_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__REINT_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32>
-// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl <4 x i32> [[TMP7]], splat (i32 16)
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[VSHLL_N_I_I]], ptr [[__REINT1_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[__REINT1_808_I_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[TMP8]]
+// CHECK-A32-SOFTFP-NEXT: [[__P01_I5_CAST:%.*]] = bitcast <4 x i32> [[A_COERCE:%.*]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I5_CAST]], <8 x bfloat> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[SHUFFLE_I]] to <4 x i16>
+// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[DOTCAST]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: [[VSHLL_N_I_I:%.*]] = shl nuw <4 x i32> [[TMP0]], splat (i32 16)
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x i32> [[VSHLL_N_I_I]] to <4 x float>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x float> [[DOTCAST1]]
 //
 float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
   return vcvtq_high_f32_bf16(a);
@@ -222,39 +116,20 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
 // CHECK-A64-LABEL: @test_vcvt_bf16_f32(
 // CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
-// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A:%.*]])
+// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
 // CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A:%.*]])
 // CHECK-A32-HARDFP-NEXT: ret <4 x bfloat> [[VCVTFP2BF1_I]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvt_bf16_f32(
 // CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I1:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP1]], ptr [[RETVAL_I1]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[RETVAL_I1]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP2]], ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP3]], ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x bfloat>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP5]], ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT: ret <2 x i32> [[TMP6]]
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A:%.*]])
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <2 x i32>
+// CHECK-A32-SOFTFP-NEXT: ret <2 x i32> [[DOTCAST]]
 //
 bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
   return vcvt_bf16_f32(a);
@@ -262,58 +137,22 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
 // CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
 // CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
+// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A:%.*]])
 // CHECK-A64-NEXT: ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
 // CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A:%.*]])
 // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 // CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_low_bf16_f32(
 // CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I4:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P1_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I1:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE1_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE4_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP1]], ptr [[RETVAL_I1]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[RETVAL_I1]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP2]], ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> zeroinitializer, ptr [[COERCE1_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[COERCE1_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP3]], ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP4]], ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <4 x bfloat>, ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP5]], ptr [[__P1_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P12_I:%.*]] = load <4 x bfloat>, ptr [[__P1_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> [[__P01_I]], <4 x bfloat> [[__P12_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I4]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[RETVAL_I4]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP6]], ptr [[COERCE4_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = load <8 x bfloat>, ptr [[COERCE4_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[TMP7]], ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP8]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = load <8 x bfloat>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[TMP9]], ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP10]]
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A:%.*]])
+// CHECK-A32-SOFTFP-NEXT: [[__P12_I_CAST:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> [[__P12_I_CAST]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[DOTCAST]]
 //
 bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
   return vcvtq_low_bf16_f32(a);
@@ -321,82 +160,25 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
 // CHECK-A64-LABEL: @test_vcvtq_high_bf16_f32(
 // CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F322_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]])
-// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]] to <16 x i8>
+// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F322_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE:%.*]], <4 x float> [[A:%.*]])
 // CHECK-A64-NEXT: ret <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]]
 //
 // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
 // CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
-// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[INACTIVE:%.*]], <8 x bfloat> [[INACTIVE]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A:%.*]])
+// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[INACTIVE:%.*]], <8 x bfloat> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I8:%.*]] = shufflevector <4 x bfloat> [[VCVTFP2BF1_I]], <4 x bfloat> [[SHUFFLE_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 // CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I8]]
 //
 // CHECK-A32-SOFTFP-LABEL: @test_vcvtq_high_bf16_f32(
 // CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I11:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_I12:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P1_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I8:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I3:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_I4:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P0_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE2_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE4_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE5_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE6_I:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE8_I:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[RETVAL:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[INACTIVE:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: [[COERCE2:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[INACTIVE_COERCE:%.*]], ptr [[INACTIVE]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[INACTIVE1:%.*]] = load <8 x bfloat>, ptr [[INACTIVE]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[INACTIVE1]], ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[COERCE]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP0]], ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <8 x bfloat>, ptr [[__P0_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
-// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP2]], ptr [[RETVAL_I8]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[RETVAL_I8]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP3]], ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = load <4 x bfloat>, ptr [[COERCE_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[__P01_I]], ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[COERCE2_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP5]], ptr [[__P0_I4]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P01_I7:%.*]] = load <8 x bfloat>, ptr [[__P0_I4]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I7]], <8 x bfloat> [[__P01_I7]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[SHUFFLE_I]], ptr [[RETVAL_I3]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[RETVAL_I3]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP6]], ptr [[COERCE4_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP7:%.*]] = load <4 x bfloat>, ptr [[COERCE4_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP4]], ptr [[COERCE5_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[COERCE5_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP7]], ptr [[COERCE6_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[COERCE6_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP8]], ptr [[__P0_I12]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P01_I16:%.*]] = load <4 x bfloat>, ptr [[__P0_I12]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <2 x i32> [[TMP9]], ptr [[__P1_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[__P12_I:%.*]] = load <4 x bfloat>, ptr [[__P1_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[__P01_I16]], <4 x bfloat> [[__P12_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[SHUFFLE_I17]], ptr [[RETVAL_I11]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr [[RETVAL_I11]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP10]], ptr [[COERCE8_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP11:%.*]] = load <8 x bfloat>, ptr [[COERCE8_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[TMP11]], ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr [[RETVAL_I]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP12]], ptr [[COERCE2]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP13:%.*]] = load <8 x bfloat>, ptr [[COERCE2]], align 8
-// CHECK-A32-SOFTFP-NEXT: store <8 x bfloat> [[TMP13]], ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr [[RETVAL]], align 8
-// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[TMP14]]
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A:%.*]])
+// CHECK-A32-SOFTFP-NEXT: [[__P01_I7_CAST:%.*]] = bitcast <4 x i32> [[INACTIVE_COERCE:%.*]] to <8 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__P01_I7_CAST]], <8 x bfloat> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-SOFTFP-NEXT: [[__P01_I16_CAST:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
+// CHECK-A32-SOFTFP-NEXT: [[SHUFFLE_I17:%.*]] = shufflevector <4 x bfloat> [[__P01_I16_CAST]], <4 x bfloat> [[SHUFFLE_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-SOFTFP-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[SHUFFLE_I17]] to <4 x i32>
+// CHECK-A32-SOFTFP-NEXT: ret <4 x i32> [[DOTCAST]]
 //
 bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
   return vcvtq_high_bf16_f32(inactive, a);
@@ -423,15 +205,11 @@ bfloat16_t test_vcvth_bf16_f32(float32_t a) {
 // CHECK-LABEL: @test_vcvtah_f32_bf16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_I:%.*]] = alloca bfloat, align 2
-// CHECK-NEXT: [[__REINT1_I:%.*]] = alloca i32, align 4
-// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[__REINT_I]], align 2
-// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[__REINT_I]], align 2
-// CHECK-NEXT: [[CONV_I:%.*]] = zext i16 [[TMP0]] to i32
-// CHECK-NEXT: [[SHL_I:%.*]] = shl i32 [[CONV_I]], 16
-// CHECK-NEXT: store i32 [[SHL_I]], ptr [[__REINT1_I]], align 4
-// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[__REINT1_I]], align 4
-// CHECK-NEXT: ret float [[TMP1]]
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast bfloat [[A:%.*]] to i16
+// CHECK-NEXT: [[CONV_I:%.*]] = zext i16 [[DOTCAST]] to i32
+// CHECK-NEXT: [[SHL_I:%.*]] = shl nuw i32 [[CONV_I]], 16
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast i32 [[SHL_I]] to float
+// CHECK-NEXT: ret float [[DOTCAST1]]
 //
 float32_t test_vcvtah_f32_bf16(bfloat16_t a) {
   return vcvtah_f32_bf16(a);
diff --git a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
index c4f0b78fc6a57..b357042843162 100644
--- a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
@@ -2,11 +2,11 @@
 // RUN: %clang_cc1 -triple armv8-arm-none-eabi \
 // RUN: -target-feature +neon -target-feature +bf16 -mfloat-abi soft \
 // RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck %s
+// RUN: | opt -S -passes=mem2reg,instcombine | FileCheck %s
 // RUN: %clang_cc1 -triple armv8-arm-none-eabi \
 // RUN: -target-feature +neon -target-feature +bf16 -mfloat-abi hard \
 // RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg | FileCheck %s
+// RUN: | opt -S -passes=mem2reg,instcombine | FileCheck %s
 // REQUIRES: aarch64-registered-target || arm-registered-target
@@ -14,10 +14,7 @@
 // CHECK-LABEL: @test_vbfdot_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[B]])
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[B:%.*]])
 // CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
 //
 float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
@@ -26,10 +23,7 @@ float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
 // CHECK-LABEL: @test_vbfdotq_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]])
 // CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
 //
 float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
@@ -38,19 +32,10 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
 // CHECK-LABEL: @test_vbfdot_lane_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_128:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-NEXT: [[__REINT1_128:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT: store <4 x bfloat> [[B:%.*]], ptr [[__REINT_128]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__REINT_128]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
-// CHECK-NEXT: store <2 x float> [[LANE]], ptr [[__REINT1_128]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[__REINT1_128]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]])
 // CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
 //
 float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
@@ -59,19 +44,10 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
 // CHECK-LABEL: @test_vbfdotq_laneq_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_130:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-NEXT: [[__REINT1_130:%.*]] = alloca <4 x float>, align 8
-// CHECK-NEXT: store <8 x bfloat> [[B:%.*]], ptr [[__REINT_130]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__REINT_130]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: store <4 x float> [[LANE]], ptr [[__REINT1_130]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x bfloat>, ptr [[__REINT1_130]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[TMP3]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]])
 // CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
 //
 float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -80,19 +56,10 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b
 // CHECK-LABEL: @test_vbfdot_laneq_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_132:%.*]] = alloca <8 x bfloat>, align 8
-// CHECK-NEXT: [[__REINT1_132:%.*]] = alloca <2 x float>, align 8
-// CHECK-NEXT: store <8 x bfloat> [[B:%.*]], ptr [[__REINT_132]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[__REINT_132]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP2]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: store <2 x float> [[LANE]], ptr [[__REINT1_132]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load <4 x bfloat>, ptr [[__REINT1_132]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[TMP3]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]])
 // CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
 //
 float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
@@ -101,19 +68,10 @@ float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
 // CHECK-LABEL: @test_vbfdotq_lane_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[__REINT_126:%.*]] = alloca <4 x bfloat>, align 8
-// CHECK-NEXT: [[__REINT1_126:%.*]] = alloca <4 x float>, align 8
-// CHECK-NEXT: store <4 x bfloat> [[B:%.*]], ptr [[__REINT_126]], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[__REINT_126]], align 8
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[TMP0]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <4 x i32> zeroinitializer
-// CHECK-NEXT: store <4 x float> [[LANE]], ptr [[__REINT1_126]], align 8
-// CHECK-NEXT: [[TMP3:%.*]] = load <8 x bfloat>, ptr [[__REINT1_126]], align 8
-// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[TMP3]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP3]])
+// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]])
 // CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
 //
 float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
@@ -122,11 +80,7 @@ float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
 // CHECK-LABEL: @test_vbfmmlaq_f32(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMMLAQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) -// CHECK-NEXT: [[VBFMMLAQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMMLAQ_F323_I]] to <16 x i8> +// CHECK-NEXT: [[VBFMMLAQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) // CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_F323_I]] // float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { @@ -135,11 +89,7 @@ float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { // CHECK-LABEL: @test_vbfmlalbq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) -// CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8> +// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) // CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]] // float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { @@ -148,11 +98,7 @@ float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { // CHECK-LABEL: @test_vbfmlaltq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) -// CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8> +// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) // CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]] // float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { @@ -161,27 +107,8 @@ float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { // CHECK-LABEL: @test_vbfmlalbq_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[B:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0 -// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2 -// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// 
CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4 -// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6 -// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) -// CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8> +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) // CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]] // float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { @@ -190,27 +117,8 @@ float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t // CHECK-LABEL: @test_vbfmlalbq_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 3 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0 -// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2 -// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4 -// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6 -// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], 
<8 x bfloat> [[VECINIT35]]) -// CHECK-NEXT: [[VBFMLALBQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_F323_I]] to <16 x i8> +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> poison, <8 x i32> +// CHECK-NEXT: [[VBFMLALBQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) // CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_F323_I]] // float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { @@ -219,27 +127,8 @@ float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t // CHECK-LABEL: @test_vbfmlaltq_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[B:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0 -// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> [[VECINIT]], bfloat [[VGET_LANE3]], i32 1 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2 -// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4 -// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6 -// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <4 x bfloat> [[B]], i32 0 -// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) -// CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8> +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <4 x bfloat> [[B:%.*]], <4 x bfloat> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) // CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]] // float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) { @@ -248,27 +137,8 @@ float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t // CHECK-LABEL: @test_vbfmlaltq_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x bfloat> [[B:%.*]], i32 3 -// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x bfloat> poison, bfloat [[VGET_LANE]], i32 0 -// CHECK-NEXT: [[VGET_LANE3:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT5:%.*]] = insertelement <8 x bfloat> 
[[VECINIT]], bfloat [[VGET_LANE3]], i32 1 -// CHECK-NEXT: [[VGET_LANE8:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <8 x bfloat> [[VECINIT5]], bfloat [[VGET_LANE8]], i32 2 -// CHECK-NEXT: [[VGET_LANE13:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT15:%.*]] = insertelement <8 x bfloat> [[VECINIT10]], bfloat [[VGET_LANE13]], i32 3 -// CHECK-NEXT: [[VGET_LANE18:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT20:%.*]] = insertelement <8 x bfloat> [[VECINIT15]], bfloat [[VGET_LANE18]], i32 4 -// CHECK-NEXT: [[VGET_LANE23:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT25:%.*]] = insertelement <8 x bfloat> [[VECINIT20]], bfloat [[VGET_LANE23]], i32 5 -// CHECK-NEXT: [[VGET_LANE28:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT30:%.*]] = insertelement <8 x bfloat> [[VECINIT25]], bfloat [[VGET_LANE28]], i32 6 -// CHECK-NEXT: [[VGET_LANE33:%.*]] = extractelement <8 x bfloat> [[B]], i32 3 -// CHECK-NEXT: [[VECINIT35:%.*]] = insertelement <8 x bfloat> [[VECINIT30]], bfloat [[VGET_LANE33]], i32 7 -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8> -// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) -// CHECK-NEXT: [[VBFMLALTQ_F324_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_F323_I]] to <16 x i8> +// CHECK-NEXT: [[VECINIT35:%.*]] = shufflevector <8 x bfloat> [[B:%.*]], <8 x bfloat> poison, <8 x i32> +// CHECK-NEXT: [[VBFMLALTQ_F323_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[VECINIT35]]) // CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_F323_I]] // float32x4_t test_vbfmlaltq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) { diff --git a/clang/test/CodeGen/arm-bf16-getset-intrinsics.c b/clang/test/CodeGen/arm-bf16-getset-intrinsics.c index b87d0e8eb68bb..6ef05790b0217 100644 --- a/clang/test/CodeGen/arm-bf16-getset-intrinsics.c +++ b/clang/test/CodeGen/arm-bf16-getset-intrinsics.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi hard \ -// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,instcombine | FileCheck %s // RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi soft \ -// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,instcombine | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target @@ -19,10 +19,8 @@ bfloat16x4_t test_vcreate_bf16(uint64_t a) { // CHECK-LABEL: @test_vdup_n_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x bfloat> poison, bfloat [[V:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x bfloat> [[VECINIT_I]], bfloat [[V]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x bfloat> [[VECINIT1_I]], bfloat [[V]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x bfloat> 
[[VECINIT2_I]], bfloat [[V]], i32 3 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x bfloat> poison, bfloat [[V:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x bfloat> [[VECINIT_I]], <4 x bfloat> poison, <4 x i32> zeroinitializer // CHECK-NEXT: ret <4 x bfloat> [[VECINIT3_I]] // bfloat16x4_t test_vdup_n_bf16(bfloat16_t v) { @@ -31,14 +29,8 @@ bfloat16x4_t test_vdup_n_bf16(bfloat16_t v) { // CHECK-LABEL: @test_vdupq_n_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x bfloat> poison, bfloat [[V:%.*]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x bfloat> [[VECINIT_I]], bfloat [[V]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x bfloat> [[VECINIT1_I]], bfloat [[V]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x bfloat> [[VECINIT2_I]], bfloat [[V]], i32 3 -// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x bfloat> [[VECINIT3_I]], bfloat [[V]], i32 4 -// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x bfloat> [[VECINIT4_I]], bfloat [[V]], i32 5 -// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x bfloat> [[VECINIT5_I]], bfloat [[V]], i32 6 -// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x bfloat> [[VECINIT6_I]], bfloat [[V]], i32 7 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x bfloat> poison, bfloat [[V:%.*]], i64 0 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x bfloat> [[VECINIT_I]], <8 x bfloat> poison, <8 x i32> zeroinitializer // CHECK-NEXT: ret <8 x bfloat> [[VECINIT7_I]] // bfloat16x8_t test_vdupq_n_bf16(bfloat16_t v) { @@ -47,9 +39,7 @@ bfloat16x8_t test_vdupq_n_bf16(bfloat16_t v) { // CHECK-LABEL: @test_vdup_lane_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x bfloat> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[V:%.*]], <4 x bfloat> poison, <4 x i32> // CHECK-NEXT: ret <4 x bfloat> [[LANE]] // bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) { @@ -58,9 +48,7 @@ bfloat16x4_t test_vdup_lane_bf16(bfloat16x4_t v) { // CHECK-LABEL: @test_vdupq_lane_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[V:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x bfloat> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[V:%.*]], <4 x bfloat> poison, <8 x i32> // CHECK-NEXT: ret <8 x bfloat> [[LANE]] // bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) { @@ -69,9 +57,7 @@ bfloat16x8_t test_vdupq_lane_bf16(bfloat16x4_t v) { // CHECK-LABEL: @test_vdup_laneq_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x bfloat> -// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[V:%.*]], <8 x bfloat> poison, <4 x i32> // CHECK-NEXT: ret <4 x bfloat> [[LANE]] // bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) { @@ -80,9 +66,7 @@ bfloat16x4_t test_vdup_laneq_bf16(bfloat16x8_t v) { // CHECK-LABEL: @test_vdupq_laneq_bf16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[V:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast 
-// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP1]], <8 x i32>
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[V:%.*]], <8 x bfloat> poison, <8 x i32>
 // CHECK-NEXT: ret <8 x bfloat> [[LANE]]
 //
 bfloat16x8_t test_vdupq_laneq_bf16(bfloat16x8_t v) {
@@ -100,7 +84,7 @@ bfloat16x8_t test_vcombine_bf16(bfloat16x4_t low, bfloat16x4_t high) {
 // CHECK-LABEL: @test_vget_high_bf16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 // CHECK-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
 //
 bfloat16x4_t test_vget_high_bf16(bfloat16x8_t a) {
@@ -109,7 +93,7 @@ bfloat16x4_t test_vget_high_bf16(bfloat16x8_t a) {
 // CHECK-LABEL: @test_vget_low_bf16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[A:%.*]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 // CHECK-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
 //
 bfloat16x4_t test_vget_low_bf16(bfloat16x8_t a) {
@@ -118,7 +102,7 @@ bfloat16x4_t test_vget_low_bf16(bfloat16x8_t a) {
 // CHECK-LABEL: @test_vget_lane_bf16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i32 1
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i64 1
 // CHECK-NEXT: ret bfloat [[VGET_LANE]]
 //
 bfloat16_t test_vget_lane_bf16(bfloat16x4_t v) {
@@ -127,7 +111,7 @@ bfloat16_t test_vget_lane_bf16(bfloat16x4_t v) {
 // CHECK-LABEL: @test_vgetq_lane_bf16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i32 7
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i64 7
 // CHECK-NEXT: ret bfloat [[VGET_LANE]]
 //
 bfloat16_t test_vgetq_lane_bf16(bfloat16x8_t v) {
@@ -136,7 +120,7 @@ bfloat16_t test_vgetq_lane_bf16(bfloat16x8_t v) {
 // CHECK-LABEL: @test_vset_lane_bf16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i32 1
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i64 1
 // CHECK-NEXT: ret <4 x bfloat> [[VSET_LANE]]
 //
 bfloat16x4_t test_vset_lane_bf16(bfloat16_t a, bfloat16x4_t v) {
@@ -145,7 +129,7 @@ bfloat16x4_t test_vset_lane_bf16(bfloat16_t a, bfloat16x4_t v) {
 // CHECK-LABEL: @test_vsetq_lane_bf16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i32 7
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x bfloat> [[V:%.*]], bfloat [[A:%.*]], i64 7
 // CHECK-NEXT: ret <8 x bfloat> [[VSET_LANE]]
 //
 bfloat16x8_t test_vsetq_lane_bf16(bfloat16_t a, bfloat16x8_t v) {
@@ -154,7 +138,7 @@ bfloat16x8_t test_vsetq_lane_bf16(bfloat16_t a, bfloat16x8_t v) {
 // CHECK-LABEL: @test_vduph_lane_bf16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i32 1
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i64 1
 // CHECK-NEXT: ret bfloat [[VGET_LANE]]
 //
 bfloat16_t test_vduph_lane_bf16(bfloat16x4_t v) {
@@ -163,7 +147,7 @@ bfloat16_t test_vduph_lane_bf16(bfloat16x4_t v) {
 // CHECK-LABEL: @test_vduph_laneq_bf16(
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i32 7
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i64 7
 // CHECK-NEXT: ret bfloat [[VGET_LANE]]
 //
 bfloat16_t test_vduph_laneq_bf16(bfloat16x8_t v) {
diff --git a/clang/test/CodeGen/arm-neon-directed-rounding-constrained.c b/clang/test/CodeGen/arm-neon-directed-rounding-constrained.c
index 11de8ba1dab7a..45923d9343c3f 100644
--- a/clang/test/CodeGen/arm-neon-directed-rounding-constrained.c
+++ b/clang/test/CodeGen/arm-neon-directed-rounding-constrained.c
@@ -1,63 +1,77 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 \
 // RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=COMMON,COMMONIR,UNCONSTRAINED %s
+// RUN: opt -S -passes=mem2reg,instcombine | FileCheck -check-prefixes=UNCONSTRAINED-A32 %s
 // RUN: %clang_cc1 -triple arm64-linux-gnueabihf -target-feature +neon \
 // RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=COMMON,COMMONIR,UNCONSTRAINED %s
+// RUN: opt -S -passes=mem2reg,instcombine | FileCheck -check-prefixes=UNCONSTRAINED-A64 %s
 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 \
 // RUN: -ffp-exception-behavior=strict \
 // RUN: -fexperimental-strict-floating-point \
 // RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=COMMON,COMMONIR,CONSTRAINED %s
+// RUN: opt -S -passes=mem2reg,instcombine | FileCheck -check-prefixes=CONSTRAINED-A32 %s
 // RUN: %clang_cc1 -triple arm64-linux-gnueabihf -target-feature +neon \
 // RUN: -ffp-exception-behavior=strict \
 // RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=COMMON,COMMONIR,CONSTRAINED %s
-
-// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 \
-// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | llc -o=- - | FileCheck -check-prefixes=COMMON,CHECK-ASM32 %s
-// RUN: %clang_cc1 -triple arm64-linux-gnueabihf -target-feature +neon \
-// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | llc -o=- - | FileCheck -check-prefixes=COMMON,CHECK-ASM64 %s
-
-// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 \
-// RUN: -ffp-exception-behavior=strict \
-// RUN: -fexperimental-strict-floating-point \
-// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | llc -o=- - | FileCheck -check-prefixes=COMMON,CHECK-ASM32 %s
-// RUN: %clang_cc1 -triple arm64-linux-gnueabihf -target-feature +neon \
-// RUN: -ffp-exception-behavior=strict \
-// RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | llc -o=- - | FileCheck -check-prefixes=COMMON,CHECK-ASM64 %s
+// RUN: opt -S -passes=mem2reg,instcombine | FileCheck -check-prefixes=CONSTRAINED-A64 %s
 // REQUIRES: arm-registered-target,aarch64-registered-target
 #include <arm_neon.h>
-// COMMON-LABEL: test_vrndi_f32
-// COMMONIR: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// UNCONSTRAINED: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %a)
-// CONSTRAINED: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.experimental.constrained.nearbyint.v2f32(<2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}}
-// CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}}
-// CHECK-ASM64: frinti v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
-// COMMONIR: ret <2 x float> [[VRNDI1_I]]
+// UNCONSTRAINED-A32-LABEL: define dso_local <2 x float> @test_vrndi_f32(
+// UNCONSTRAINED-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// UNCONSTRAINED-A32-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-A32-NEXT: [[VRNDI_V1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> [[A]])
+// UNCONSTRAINED-A32-NEXT: ret <2 x float> [[VRNDI_V1_I]]
+//
+// UNCONSTRAINED-A64-LABEL: define dso_local <2 x float> @test_vrndi_f32(
+// UNCONSTRAINED-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// UNCONSTRAINED-A64-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-A64-NEXT: [[VRNDI_V1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> [[A]])
+// UNCONSTRAINED-A64-NEXT: ret <2 x float> [[VRNDI_V1_I]]
+//
+// CONSTRAINED-A32-LABEL: define dso_local <2 x float> @test_vrndi_f32(
+// CONSTRAINED-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CONSTRAINED-A32-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-A32-NEXT: [[VRNDI_V1_I:%.*]] = call <2 x float> @llvm.experimental.constrained.nearbyint.v2f32(<2 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2:[0-9]+]]
+// CONSTRAINED-A32-NEXT: ret <2 x float> [[VRNDI_V1_I]]
+//
+// CONSTRAINED-A64-LABEL: define dso_local <2 x float> @test_vrndi_f32(
+// CONSTRAINED-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CONSTRAINED-A64-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-A64-NEXT: [[VRNDI_V1_I:%.*]] = call <2 x float> @llvm.experimental.constrained.nearbyint.v2f32(<2 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2:[0-9]+]]
+// CONSTRAINED-A64-NEXT: ret <2 x float> [[VRNDI_V1_I]]
+//
 float32x2_t test_vrndi_f32(float32x2_t a) {
   return vrndi_f32(a);
 }
-// COMMON-LABEL: test_vrndiq_f32
-// COMMONIR: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// UNCONSTRAINED: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a)
-// CONSTRAINED: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}}
-// CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}}
-// CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}}
-// CHECK-ASM32: vrintr.f32 s{{[0-9]+}}, s{{[0-9]+}}
-// CHECK-ASM64: frinti v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
-// COMMONIR: ret <4 x float> [[VRNDI1_I]]
+// UNCONSTRAINED-A32-LABEL: define dso_local <4 x float> @test_vrndiq_f32(
+// UNCONSTRAINED-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-A32-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-A32-NEXT: [[VRNDIQ_V1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[A]])
+// UNCONSTRAINED-A32-NEXT: ret <4 x float> [[VRNDIQ_V1_I]]
+//
+// UNCONSTRAINED-A64-LABEL: define dso_local <4 x float> @test_vrndiq_f32(
+// UNCONSTRAINED-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// UNCONSTRAINED-A64-NEXT: [[ENTRY:.*:]]
+// UNCONSTRAINED-A64-NEXT: [[VRNDIQ_V1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[A]])
+// UNCONSTRAINED-A64-NEXT: ret <4 x float> [[VRNDIQ_V1_I]]
+//
+// CONSTRAINED-A32-LABEL: define dso_local <4 x float> @test_vrndiq_f32(
+// CONSTRAINED-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-A32-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-A32-NEXT: [[VRNDIQ_V1_I:%.*]] = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-A32-NEXT: ret <4 x float> [[VRNDIQ_V1_I]]
+//
+// CONSTRAINED-A64-LABEL: define dso_local <4 x float> @test_vrndiq_f32(
+// CONSTRAINED-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CONSTRAINED-A64-NEXT: [[ENTRY:.*:]]
+// CONSTRAINED-A64-NEXT: [[VRNDIQ_V1_I:%.*]] = call <4 x float> @llvm.experimental.constrained.nearbyint.v4f32(<4 x float> [[A]], metadata !"round.tonearest", metadata !"fpexcept.strict") #[[ATTR2]]
+// CONSTRAINED-A64-NEXT: ret <4 x float> [[VRNDIQ_V1_I]]
+//
 float32x4_t test_vrndiq_f32(float32x4_t a) {
   return vrndiq_f32(a);
 }
diff --git a/clang/test/CodeGen/arm-neon-directed-rounding.c b/clang/test/CodeGen/arm-neon-directed-rounding.c
index 63ec016b49a0c..578ca85311426 100644
--- a/clang/test/CodeGen/arm-neon-directed-rounding.c
+++ b/clang/test/CodeGen/arm-neon-directed-rounding.c
@@ -1,130 +1,239 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 \
 // RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=CHECK,CHECK-A32 %s
+// RUN: opt -S -passes=mem2reg,instcombine | FileCheck -check-prefixes=CHECK,CHECK-A32 %s
 // RUN: %clang_cc1 -triple arm64-linux-gnueabihf -target-feature +neon \
 // RUN: -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | \
-// RUN: opt -S -passes=mem2reg | FileCheck -check-prefixes=CHECK,CHECK-A64 %s
+// RUN: opt -S -passes=mem2reg,instcombine | FileCheck -check-prefixes=CHECK,CHECK-A64 %s
 // REQUIRES: aarch64-registered-target || arm-registered-target
 #include <arm_neon.h>
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vrnda_f32(<2 x float> noundef %a)
-// CHECK-A32: [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> %a)
-// CHECK-A64: [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDA_V1_I]]
+// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrnda_f32(
+// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> [[A]])
+// CHECK-A32-NEXT: ret <2 x float> [[VRNDA_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrnda_f32(
+// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDA1_I:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[A]])
+// CHECK-A64-NEXT: ret <2 x float> [[VRNDA1_I]]
+//
 float32x2_t test_vrnda_f32(float32x2_t a) {
   return vrnda_f32(a);
 }
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndaq_f32(<4 x float> noundef %a)
-// CHECK-A32: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> %a)
-// CHECK-A64: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDAQ_V1_I]]
+// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndaq_f32(
+// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> [[A]])
+// CHECK-A32-NEXT: ret <4 x float> [[VRNDAQ_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndaq_f32(
+// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDA1_I:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[A]])
+// CHECK-A64-NEXT: ret <4 x float> [[VRNDA1_I]]
+//
 float32x4_t test_vrndaq_f32(float32x4_t a) {
   return vrndaq_f32(a);
 }
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vrndm_f32(<2 x float> noundef %a)
-// CHECK-A32: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> %a)
-// CHECK-A64: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDM_V1_I]]
+// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrndm_f32(
+// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> [[A]])
+// CHECK-A32-NEXT: ret <2 x float> [[VRNDM_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrndm_f32(
+// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDM1_I:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[A]])
+// CHECK-A64-NEXT: ret <2 x float> [[VRNDM1_I]]
+//
 float32x2_t test_vrndm_f32(float32x2_t a) {
   return vrndm_f32(a);
 }
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndmq_f32(<4 x float> noundef %a)
-// CHECK-A32: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> %a)
-// CHECK-A64: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDMQ_V1_I]]
+// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndmq_f32(
+// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> [[A]])
+// CHECK-A32-NEXT: ret <4 x float> [[VRNDMQ_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndmq_f32(
+// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDM1_I:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[A]])
+// CHECK-A64-NEXT: ret <4 x float> [[VRNDM1_I]]
+//
 float32x4_t test_vrndmq_f32(float32x4_t a) {
   return vrndmq_f32(a);
 }
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vrndn_f32(<2 x float> noundef %a)
-// CHECK-A32: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> %a)
-// CHECK-A64: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDN_V1_I]]
+// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrndn_f32(
+// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> [[A]])
+// CHECK-A32-NEXT: ret <2 x float> [[VRNDN_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrndn_f32(
+// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDN1_I:%.*]] = call <2 x float> @llvm.roundeven.v2f32(<2 x float> [[A]])
+// CHECK-A64-NEXT: ret <2 x float> [[VRNDN1_I]]
+//
 float32x2_t test_vrndn_f32(float32x2_t a) {
   return vrndn_f32(a);
 }
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndnq_f32(<4 x float> noundef %a)
-// CHECK-A32: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> %a)
-// CHECK-A64: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDNQ_V1_I]]
+// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndnq_f32(
+// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> [[A]])
+// CHECK-A32-NEXT: ret <4 x float> [[VRNDNQ_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndnq_f32(
+// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDN1_I:%.*]] = call <4 x float> @llvm.roundeven.v4f32(<4 x float> [[A]])
+// CHECK-A64-NEXT: ret <4 x float> [[VRNDN1_I]]
+//
 float32x4_t test_vrndnq_f32(float32x4_t a) {
   return vrndnq_f32(a);
 }
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vrndp_f32(<2 x float> noundef %a)
-// CHECK-A32: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> %a)
-// CHECK-A64: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDP_V1_I]]
+// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrndp_f32(
+// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> [[A]])
+// CHECK-A32-NEXT: ret <2 x float> [[VRNDP_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrndp_f32(
+// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDP1_I:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[A]])
+// CHECK-A64-NEXT: ret <2 x float> [[VRNDP1_I]]
+//
 float32x2_t test_vrndp_f32(float32x2_t a) {
   return vrndp_f32(a);
 }
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndpq_f32(<4 x float> noundef %a)
-// CHECK-A32: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> %a)
-// CHECK-A64: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDPQ_V1_I]]
+// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndpq_f32(
+// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> [[A]])
+// CHECK-A32-NEXT: ret <4 x float> [[VRNDPQ_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndpq_f32(
+// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDP1_I:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[A]])
+// CHECK-A64-NEXT: ret <4 x float> [[VRNDP1_I]]
+//
 float32x4_t test_vrndpq_f32(float32x4_t a) {
   return vrndpq_f32(a);
 }
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vrndx_f32(<2 x float> noundef %a)
-// CHECK-A32: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> %a)
-// CHECK-A64: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDX_V1_I]]
+// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrndx_f32(
+// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> [[A]])
+// CHECK-A32-NEXT: ret <2 x float> [[VRNDX_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrndx_f32(
+// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDX1_I:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> [[A]])
+// CHECK-A64-NEXT: ret <2 x float> [[VRNDX1_I]]
+//
 float32x2_t test_vrndx_f32(float32x2_t a) {
   return vrndx_f32(a);
 }
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndxq_f32(<4 x float> noundef %a)
-// CHECK-A32: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> %a)
-// CHECK-A64: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDXQ_V1_I]]
+// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndxq_f32(
+// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> [[A]])
+// CHECK-A32-NEXT: ret <4 x float> [[VRNDXQ_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndxq_f32(
+// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDX1_I:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[A]])
+// CHECK-A64-NEXT: ret <4 x float> [[VRNDX1_I]]
+//
 float32x4_t test_vrndxq_f32(float32x4_t a) {
   return vrndxq_f32(a);
 }
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vrnd_f32(<2 x float> noundef %a)
-// CHECK-A32: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> %a)
-// CHECK-A64: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRND_V1_I]]
+// CHECK-A32-LABEL: define dso_local <2 x float> @test_vrnd_f32(
+// CHECK-A32-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRND_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> [[A]])
+// CHECK-A32-NEXT: ret <2 x float> [[VRND_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <2 x float> @test_vrnd_f32(
+// CHECK-A64-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDZ1_I:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> [[A]])
+// CHECK-A64-NEXT: ret <2 x float> [[VRNDZ1_I]]
+//
 float32x2_t test_vrnd_f32(float32x2_t a) {
   return vrnd_f32(a);
 }
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndq_f32(<4 x float> noundef %a)
-// CHECK-A32: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> %a)
-// CHECK-A64: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDQ_V1_I]]
+// CHECK-A32-LABEL: define dso_local <4 x float> @test_vrndq_f32(
+// CHECK-A32-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> [[A]])
+// CHECK-A32-NEXT: ret <4 x float> [[VRNDQ_V1_I]]
+//
+// CHECK-A64-LABEL: define dso_local <4 x float> @test_vrndq_f32(
+// CHECK-A64-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDZ1_I:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[A]])
+// CHECK-A64-NEXT: ret <4 x float> [[VRNDZ1_I]]
+//
 float32x4_t test_vrndq_f32(float32x4_t a) {
   return vrndq_f32(a);
 }
-// CHECK-LABEL: define{{.*}} float @test_vrndns_f32(float noundef %a)
-// CHECK-A32: [[VRNDN_I:%.*]] = call float @llvm.arm.neon.vrintn.f32(float %a)
-// CHECK-A64: [[VRNDN_I:%.*]] = call float @llvm.roundeven.f32(float %a)
-// CHECK: ret float [[VRNDN_I]]
+// CHECK-A32-LABEL: define dso_local float @test_vrndns_f32(
+// CHECK-A32-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A32-NEXT: [[ENTRY:.*:]]
+// CHECK-A32-NEXT: [[VRNDN_I:%.*]] = call float @llvm.arm.neon.vrintn.f32(float [[A]])
+// CHECK-A32-NEXT: ret float [[VRNDN_I]]
+//
+// CHECK-A64-LABEL: define dso_local float @test_vrndns_f32(
+// CHECK-A64-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-A64-NEXT: [[ENTRY:.*:]]
+// CHECK-A64-NEXT: [[VRNDN_I:%.*]] = call float @llvm.roundeven.f32(float [[A]])
+// CHECK-A64-NEXT: ret float [[VRNDN_I]]
+//
 float32_t test_vrndns_f32(float32_t a) {
   return vrndns_f32(a);
 }
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vrndi_f32(<2 x float> noundef %a)
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRNDI1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRNDI1_I]]
+// CHECK-LABEL: define dso_local <2 x float> @test_vrndi_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDI_V1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VRNDI_V1_I]]
+//
 float32x2_t test_vrndi_f32(float32x2_t a) {
   return vrndi_f32(a);
 }
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vrndiq_f32(<4 x float> noundef %a)
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRNDI1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRNDI1_I]]
+// CHECK-LABEL: define dso_local <4 x float> @test_vrndiq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRNDIQ_V1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[VRNDIQ_V1_I]]
+//
 float32x4_t test_vrndiq_f32(float32x4_t a) {
   return vrndiq_f32(a);
 }
diff --git a/clang/test/CodeGen/arm-neon-fma.c b/clang/test/CodeGen/arm-neon-fma.c
index 682eda9750c81..64bc2ebbe9bda 100644
--- a/clang/test/CodeGen/arm-neon-fma.c
+++ b/clang/test/CodeGen/arm-neon-fma.c
@@ -4,7 +4,7 @@
 // RUN: -target-cpu cortex-a7 \
 // RUN: -mfloat-abi hard \
 // RUN: -ffreestanding \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine | FileCheck %s
 // REQUIRES: aarch64-registered-target || arm-registered-target
@@ -13,11 +13,8 @@
 // CHECK-LABEL: define {{[^@]+}}@test_fma_order
 // CHECK-SAME: (<2 x float> noundef [[ACCUM:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[ACCUM]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[LHS]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[RHS]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LHS]], <2 x float> [[RHS]], <2 x float> [[ACCUM]])
-// CHECK-NEXT: ret <2 x float> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LHS]], <2 x float> [[RHS]], <2 x float> [[ACCUM]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
 //
 float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs) {
   return vfma_f32(accum, lhs, rhs);
@@ -26,11 +23,8 @@ float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs)
 // CHECK-LABEL: define {{[^@]+}}@test_fmaq_order
 // CHECK-SAME: (<4 x float> noundef [[ACCUM:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[ACCUM]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[LHS]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[RHS]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LHS]], <4 x float> [[RHS]], <4 x float> [[ACCUM]])
-// CHECK-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LHS]], <4 x float> [[RHS]], <4 x float> [[ACCUM]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
 //
 float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs) {
   return vfmaq_f32(accum, lhs, rhs);
@@ -39,13 +33,10 @@ float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs)
 // CHECK-LABEL: define {{[^@]+}}@test_vfma_n_f32
 // CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[N:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
-// CHECK-NEXT: ret <2 x float> [[TMP3]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[N]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
 //
 float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
   return vfma_n_f32(a, b, n);
@@ -54,15 +45,10 @@ float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
 // CHECK-LABEL: define {{[^@]+}}@test_vfmaq_n_f32
 // CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[N:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N]], i32 0
-// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
-// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
-// CHECK-NEXT: ret <4 x float> [[TMP3]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[N]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
 //
 float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
   return vfmaq_n_f32(a, b, n);
diff --git a/clang/test/CodeGen/arm-neon-numeric-maxmin.c b/clang/test/CodeGen/arm-neon-numeric-maxmin.c
index d2d4fee10f079..79d752574c780 100644
--- a/clang/test/CodeGen/arm-neon-numeric-maxmin.c
+++ b/clang/test/CodeGen/arm-neon-numeric-maxmin.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
-// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,instcombine | FileCheck %s
 // REQUIRES: aarch64-registered-target || arm-registered-target
@@ -8,10 +8,7 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vmaxnm_f32
 // CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
 // CHECK-NEXT: [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> [[A]], <2 x float> [[B]])
-// CHECK-NEXT: [[VMAXNM_V3_I:%.*]] = bitcast <2 x float> [[VMAXNM_V2_I]] to <8 x i8>
 // CHECK-NEXT: ret <2 x float> [[VMAXNM_V2_I]]
 //
 float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) {
@@ -19,12 +16,9 @@ float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vmaxnmq_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
 // CHECK-NEXT: [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> [[A]], <4 x float> [[B]])
-// CHECK-NEXT: [[VMAXNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXNMQ_V2_I]] to <16 x i8>
 // CHECK-NEXT: ret <4 x float> [[VMAXNMQ_V2_I]]
 //
 float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) {
@@ -34,10 +28,7 @@ float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) {
 // CHECK-LABEL: define {{[^@]+}}@test_vminnm_f32
 // CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
 // CHECK-NEXT: [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> [[A]], <2 x float> [[B]])
-// CHECK-NEXT: [[VMINNM_V3_I:%.*]] = bitcast <2 x float> [[VMINNM_V2_I]] to <8 x i8>
 // CHECK-NEXT: ret <2 x float> [[VMINNM_V2_I]]
 //
 float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) {
@@ -45,12 +36,9 @@ float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vminnmq_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
 // CHECK-NEXT: [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> [[A]], <4 x float> [[B]])
-// CHECK-NEXT: [[VMINNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMINNMQ_V2_I]] to <16 x i8>
 // CHECK-NEXT: ret <4 x float> [[VMINNMQ_V2_I]]
 //
 float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) {
diff --git a/clang/test/CodeGen/arm-neon-vcvtX.c b/clang/test/CodeGen/arm-neon-vcvtX.c
index c087b92102c5b..e9493fa1cf8cf 100644
--- a/clang/test/CodeGen/arm-neon-vcvtX.c
+++ b/clang/test/CodeGen/arm-neon-vcvtX.c
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
-// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg,instcombine | FileCheck %s
 // REQUIRES: aarch64-registered-target || arm-registered-target
@@ -8,7 +8,6 @@
 // CHECK-LABEL: define {{[^@]+}}@test_vcvta_s32_f32
 // CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
 // CHECK-NEXT: [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> [[A]])
 // CHECK-NEXT: ret <2 x i32> [[VCVTA_S32_V1_I]]
 //
@@ -19,7 +18,6 @@ int32x2_t test_vcvta_s32_f32(float32x2_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvta_u32_f32
 // CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
 // CHECK-NEXT: [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> [[A]])
 // CHECK-NEXT: ret <2 x i32> [[VCVTA_U32_V1_I]]
 //
@@ -28,9 +26,8 @@ uint32x2_t test_vcvta_u32_f32(float32x2_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtaq_s32_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
 // CHECK-NEXT: [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> [[A]])
 // CHECK-NEXT: ret <4 x i32> [[VCVTAQ_S32_V1_I]]
 //
@@ -39,9 +36,8 @@ int32x4_t test_vcvtaq_s32_f32(float32x4_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtaq_u32_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
 // CHECK-NEXT: [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> [[A]])
 // CHECK-NEXT: ret <4 x i32> [[VCVTAQ_U32_V1_I]]
 //
@@ -52,7 +48,6 @@ uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtn_s32_f32
 // CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
 // CHECK-NEXT: [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> [[A]])
 // CHECK-NEXT: ret <2 x i32> [[VCVTN_S32_V1_I]]
 //
@@ -63,7 +58,6 @@ int32x2_t test_vcvtn_s32_f32(float32x2_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtn_u32_f32
 // CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
 // CHECK-NEXT: [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> [[A]])
 // CHECK-NEXT: ret <2 x i32> [[VCVTN_U32_V1_I]]
 //
@@ -72,9 +66,8 @@ uint32x2_t test_vcvtn_u32_f32(float32x2_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtnq_s32_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
 // CHECK-NEXT: [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> [[A]])
 // CHECK-NEXT: ret <4 x i32> [[VCVTNQ_S32_V1_I]]
 //
@@ -83,9 +76,8 @@ int32x4_t test_vcvtnq_s32_f32(float32x4_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtnq_u32_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
 // CHECK-NEXT: [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> [[A]])
 // CHECK-NEXT: ret <4 x i32> [[VCVTNQ_U32_V1_I]]
 //
@@ -96,7 +88,6 @@ uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtp_s32_f32
 // CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
 // CHECK-NEXT: [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> [[A]])
 // CHECK-NEXT: ret <2 x i32> [[VCVTP_S32_V1_I]]
 //
@@ -107,7 +98,6 @@ int32x2_t test_vcvtp_s32_f32(float32x2_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtp_u32_f32
 // CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
 // CHECK-NEXT: [[VCVTP_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> [[A]])
 // CHECK-NEXT: ret <2 x i32> [[VCVTP_U32_V1_I]]
 //
@@ -116,9 +106,8 @@ uint32x2_t test_vcvtp_u32_f32(float32x2_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtpq_s32_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
 // CHECK-NEXT: [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> [[A]])
 // CHECK-NEXT: ret <4 x i32> [[VCVTPQ_S32_V1_I]]
 //
@@ -127,9 +116,8 @@ int32x4_t test_vcvtpq_s32_f32(float32x4_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtpq_u32_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
 // CHECK-NEXT: [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> [[A]])
 // CHECK-NEXT: ret <4 x i32> [[VCVTPQ_U32_V1_I]]
 //
@@ -140,7 +128,6 @@ uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtm_s32_f32
 // CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
 // CHECK-NEXT: [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> [[A]])
 // CHECK-NEXT: ret <2 x i32> [[VCVTM_S32_V1_I]]
 //
@@ -151,7 +138,6 @@ int32x2_t test_vcvtm_s32_f32(float32x2_t a) {
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtm_u32_f32
 // CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
 // CHECK-NEXT: [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> [[A]])
 // CHECK-NEXT: ret <2 x i32> [[VCVTM_U32_V1_I]]
 //
@@ -160,9 +146,8 @@ uint32x2_t test_vcvtm_u32_f32(float32x2_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtmq_s32_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
 // CHECK-NEXT: [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> [[A]])
 // CHECK-NEXT: ret <4 x i32> [[VCVTMQ_S32_V1_I]]
 //
@@ -171,9 +156,8 @@ int32x4_t test_vcvtmq_s32_f32(float32x4_t a) {
 }
 // CHECK-LABEL: define {{[^@]+}}@test_vcvtmq_u32_f32
-// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
 // CHECK-NEXT: [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> [[A]])
 // CHECK-NEXT: ret <4 x i32> [[VCVTMQ_U32_V1_I]]
 //
diff --git a/clang/test/CodeGen/arm-poly-add.c b/clang/test/CodeGen/arm-poly-add.c
index 201a03a5bc8b6..187ac6e8cd6a7 100644
--- a/clang/test/CodeGen/arm-poly-add.c
+++ b/clang/test/CodeGen/arm-poly-add.c
@@ -1,73 +1,68 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // REQUIRES: arm-registered-target
 // RUN: %clang_cc1 -triple armv8.2a-arm-none-eabi \
 // RUN: -target-feature +neon \
 // RUN: -mfloat-abi hard \
-// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine \
 // RUN: | FileCheck %s
 #include <arm_neon.h>
-// CHECK-LABEL: @test_vadd_p8(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[A:%.*]], [[B:%.*]]
+// CHECK-LABEL: define dso_local arm_aapcs_vfpcc <8 x i8> @test_vadd_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[A]], [[B]]
 // CHECK-NEXT: ret <8 x i8> [[TMP0]]
 //
 poly8x8_t test_vadd_p8(poly8x8_t a, poly8x8_t b) {
   return vadd_p8 (a, b);
 }
-// CHECK-LABEL: @test_vadd_p16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK-NEXT: ret <4 x i16> [[TMP3]]
+// CHECK-LABEL: define dso_local arm_aapcs_vfpcc <4 x i16> @test_vadd_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
 //
 poly16x4_t test_vadd_p16(poly16x4_t a, poly16x4_t b) {
   return vadd_p16 (a, b);
 }
-// CHECK-LABEL: @test_vadd_p64(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
-// CHECK-NEXT: ret <1 x i64> [[TMP3]]
+// CHECK-LABEL: define dso_local arm_aapcs_vfpcc <1 x i64> @test_vadd_p64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
 //
 poly64x1_t test_vadd_p64(poly64x1_t a, poly64x1_t b) {
   return vadd_p64(a, b);
 }
-// CHECK-LABEL: @test_vaddq_p8(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[A:%.*]], [[B:%.*]]
+// CHECK-LABEL: define dso_local arm_aapcs_vfpcc <16 x i8> @test_vaddq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[A]], [[B]]
 // CHECK-NEXT: ret <16 x i8> [[TMP0]]
 //
 poly8x16_t test_vaddq_p8(poly8x16_t a, poly8x16_t b){
   return vaddq_p8(a, b);
 }
-// CHECK-LABEL: @test_vaddq_p16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK-NEXT: ret <8 x i16> [[TMP3]]
+// CHECK-LABEL: define dso_local arm_aapcs_vfpcc <8 x i16> @test_vaddq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
 //
 poly16x8_t test_vaddq_p16(poly16x8_t a, poly16x8_t b){
   return vaddq_p16(a, b);
 }
-// CHECK-LABEL: @test_vaddq_p64(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = xor <16 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
-// CHECK-NEXT: ret <2 x i64> [[TMP3]]
+// CHECK-LABEL: define dso_local arm_aapcs_vfpcc <2 x i64> @test_vaddq_p64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = xor <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
 //
 poly64x2_t test_vaddq_p64(poly64x2_t a, poly64x2_t b){
   return vaddq_p64(a, b);
diff --git a/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
index 555f8ccba7c3c..a15c67ff46487 100644
--- a/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
+++ b/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
@@ -1,10 +1,10 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -triple armv8.1a-linux-gnu -target-abi apcs-gnu -target-feature +neon \
-// RUN: -emit-llvm -o - %s -disable-O0-optnone | opt -passes=mem2reg,dce -S \
+// RUN: -emit-llvm -o - %s -disable-O0-optnone | opt -passes=mem2reg,instcombine,dce -S \
 // RUN: | FileCheck %s --check-prefix=CHECK-ARM
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \
-// RUN: -target-feature +v8.1a -emit-llvm -o - %s -disable-O0-optnone | opt -passes=mem2reg,dce -S \
+// RUN: -target-feature +v8.1a -emit-llvm -o - %s -disable-O0-optnone | opt -passes=mem2reg,instcombine,dce -S \
 // RUN: | FileCheck %s --check-prefix=CHECK-AARCH64
 // REQUIRES: arm-registered-target,aarch64-registered-target
@@ -13,13 +13,13 @@
 // CHECK-ARM-LABEL: @test_vqrdmlah_s16(
 // CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
-// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
+// CHECK-ARM-NEXT: [[VQRDMLAH_S163_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
+// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_S163_I]]
 //
 // CHECK-AARCH64-LABEL: @test_vqrdmlah_s16(
 // CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
-// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
+// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_S163_I]]
 //
 int16x4_t test_vqrdmlah_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
@@ -28,13 +28,13 @@ int16x4_t test_vqrdmlah_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
 // CHECK-ARM-LABEL: @test_vqrdmlah_s32(
 // CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
-// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
+// CHECK-ARM-NEXT: [[VQRDMLAH_S323_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
+// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_S323_I]]
 //
 // CHECK-AARCH64-LABEL: @test_vqrdmlah_s32(
 // CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
-// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
+// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_S323_I]]
 //
 int32x2_t test_vqrdmlah_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
@@ -43,13 +43,13 @@ int32x2_t test_vqrdmlah_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
 // CHECK-ARM-LABEL: @test_vqrdmlahq_s16(
 // CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
-// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
+// CHECK-ARM-NEXT: [[VQRDMLAHQ_S163_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_S163_I]]
 //
 // CHECK-AARCH64-LABEL: @test_vqrdmlahq_s16(
 // CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
-// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
+// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_S163_I]]
 //
 int16x8_t test_vqrdmlahq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
@@ -58,13 +58,13 @@ int16x8_t test_vqrdmlahq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
 // CHECK-ARM-LABEL: @test_vqrdmlahq_s32(
 // CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
-// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
+// CHECK-ARM-NEXT: [[VQRDMLAHQ_S323_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_S323_I]]
 //
 // CHECK-AARCH64-LABEL: @test_vqrdmlahq_s32(
 // CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
-// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
+// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_S323_I]]
 //
 int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
@@ -73,19 +73,15 @@ int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
 // CHECK-ARM-LABEL: @test_vqrdmlah_lane_s16(
 // CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
-// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
-// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
+// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <4 x i32>
+// CHECK-ARM-NEXT: [[VQRDMLAH_S163_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_S163_I]]
 //
 // CHECK-AARCH64-LABEL: @test_vqrdmlah_lane_s16(
 // CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
-// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
-// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
+// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <4 x i32>
+// CHECK-AARCH64-NEXT: [[VQRDMLAH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_S163_I]]
 //
 int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
@@ -94,19 +90,15 @@ int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
 // CHECK-ARM-LABEL: @test_vqrdmlah_lane_s32(
 // CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
-// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
-// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
+// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C:%.*]], <2 x i32> poison, <2 x i32>
+// CHECK-ARM-NEXT: [[VQRDMLAH_S323_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_S323_I]]
 //
 // CHECK-AARCH64-LABEL: @test_vqrdmlah_lane_s32(
 // CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
-// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
-// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
+// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C:%.*]], <2 x i32> poison, <2 x i32>
+// CHECK-AARCH64-NEXT: [[VQRDMLAH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_S323_I]]
 //
 int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
@@ -115,19 +107,15 @@ int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
 // CHECK-ARM-LABEL: @test_vqrdmlahq_lane_s16(
 // CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32>
-// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
-// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
+// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <8 x i32>
+// CHECK-ARM-NEXT: [[VQRDMLAHQ_S163_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
+// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_S163_I]]
 //
 // CHECK-AARCH64-LABEL: @test_vqrdmlahq_lane_s16(
 // CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32>
-// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
-// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
+// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <8 x i32>
+// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
+// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_S163_I]]
 //
 int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
@@ -136,19 +124,15 @@ int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
 // CHECK-ARM-LABEL: @test_vqrdmlahq_lane_s32(
 // CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32>
-// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
[[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) -// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]] +// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C:%.*]], <2 x i32> poison, <4 x i32> +// CHECK-ARM-NEXT: [[VQRDMLAHQ_S323_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) +// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_S323_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlahq_lane_s32( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> -// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) -// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]] +// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C:%.*]], <2 x i32> poison, <4 x i32> +// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) +// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_S323_I]] // int32x4_t test_vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { @@ -157,13 +141,13 @@ int32x4_t test_vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { // CHECK-ARM-LABEL: @test_vqrdmlsh_s16( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) -// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]] +// CHECK-ARM-NEXT: [[VQRDMLSH_S163_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) +// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_S163_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlsh_s16( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) -// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) +// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_S163_I]] // int16x4_t test_vqrdmlsh_s16(int16x4_t a, int16x4_t b, int16x4_t c) { @@ -172,13 +156,13 @@ int16x4_t test_vqrdmlsh_s16(int16x4_t a, int16x4_t b, int16x4_t c) { // CHECK-ARM-LABEL: @test_vqrdmlsh_s32( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) -// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]] +// CHECK-ARM-NEXT: [[VQRDMLSH_S323_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) +// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_S323_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlsh_s32( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) -// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLSH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> 
[[C:%.*]]) +// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_S323_I]] // int32x2_t test_vqrdmlsh_s32(int32x2_t a, int32x2_t b, int32x2_t c) { @@ -187,13 +171,13 @@ int32x2_t test_vqrdmlsh_s32(int32x2_t a, int32x2_t b, int32x2_t c) { // CHECK-ARM-LABEL: @test_vqrdmlshq_s16( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) -// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]] +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S163_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) +// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_S163_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlshq_s16( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) -// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) +// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_S163_I]] // int16x8_t test_vqrdmlshq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { @@ -202,13 +186,13 @@ int16x8_t test_vqrdmlshq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { // CHECK-ARM-LABEL: @test_vqrdmlshq_s32( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) -// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]] +// CHECK-ARM-NEXT: [[VQRDMLSHQ_S323_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_S323_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlshq_s32( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) -// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]] +// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) +// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_S323_I]] // int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { @@ -217,19 +201,15 @@ int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { // CHECK-ARM-LABEL: @test_vqrdmlsh_lane_s16( // CHECK-ARM-NEXT: entry: -// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> -// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) -// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]] +// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <4 x i32> +// CHECK-ARM-NEXT: [[VQRDMLSH_S163_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) +// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_S163_I]] // // CHECK-AARCH64-LABEL: @test_vqrdmlsh_lane_s16( // CHECK-AARCH64-NEXT: entry: -// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> 
-// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32>
-// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
-// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
+// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <4 x i32>
+// CHECK-AARCH64-NEXT: [[VQRDMLSH_S163_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
+// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_S163_I]]
//
int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
@@ -238,19 +218,15 @@ int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlsh_lane_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
-// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
-// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
+// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C:%.*]], <2 x i32> poison, <2 x i32>
+// CHECK-ARM-NEXT: [[VQRDMLSH_S323_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_S323_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_lane_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32>
-// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
-// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
+// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C:%.*]], <2 x i32> poison, <2 x i32>
+// CHECK-AARCH64-NEXT: [[VQRDMLSH_S323_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
+// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_S323_I]]
//
int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
@@ -259,19 +235,15 @@ int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlshq_lane_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32>
-// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
-// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
+// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <8 x i32>
+// CHECK-ARM-NEXT: [[VQRDMLSHQ_S163_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
+// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_S163_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_lane_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32>
-// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
-// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
+// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C:%.*]], <4 x i16> poison, <8 x i32>
+// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S163_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
+// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_S163_I]]
//
int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
@@ -280,19 +252,15 @@ int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlshq_lane_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32>
-// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
-// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
+// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C:%.*]], <2 x i32> poison, <4 x i32>
+// CHECK-ARM-NEXT: [[VQRDMLSHQ_S323_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
+// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_S323_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_lane_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32>
-// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
-// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
+// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C:%.*]], <2 x i32> poison, <4 x i32>
+// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_S323_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
+// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_S323_I]]
//
int32x4_t test_vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
diff --git a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c
index 5f1cb34e6603d..d8ed462d97252 100644
--- a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c
+++ b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics-generic.c
@@ -1,11 +1,11 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
// RUN: %clang_cc1 -triple armv8.2a-linux-gnu -target-abi apcs-gnu -target-feature +neon -target-feature -fullfp16 \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=sroa \
+// RUN: | opt -S -passes=mem2reg,sroa,instcombine \
// RUN: | FileCheck %s --check-prefixes=CHECK-NOFP16
// RUN: %clang_cc1 -triple armv8a-linux-gnu -target-abi apcs-gnu -target-feature +neon -target-feature +fullfp16 \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=sroa \
+// RUN: | opt -S -passes=mem2reg,sroa,instcombine \
// RUN: | FileCheck %s --check-prefixes=CHECK-FP16
// REQUIRES: arm-registered-target
@@ -15,21 +15,12 @@
// CHECK-NOFP16-LABEL: define dso_local <2 x i32> @test_vbsl_f16(
// CHECK-NOFP16-SAME: <4 x i16> noundef [[A:%.*]], <2 x i32> noundef [[B_COERCE:%.*]], <2 x i32> noundef [[C_COERCE:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[C_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <2 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <2 x i32>
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> [[TMP8]])
-// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <2 x i32>
-// CHECK-NOFP16-NEXT: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP12:%.*]] = bitcast <4 x half> [[TMP11]] to <2 x i32>
-// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP12]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <8 x i8>
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C_COERCE]] to <8 x i8>
+// CHECK-NOFP16-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
+// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP3]]
//
// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vbsl_f16(
// CHECK-FP16-SAME: <4 x i16> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -48,21 +39,12 @@ float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) {
// CHECK-NOFP16-LABEL: define dso_local <4 x i32> @test_vbslq_f16(
// CHECK-NOFP16-SAME: <8 x i16> noundef [[A:%.*]], <4 x i32> noundef [[B_COERCE:%.*]], <4 x i32> noundef [[C_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[C_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
-// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <4 x i32>
-// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP12]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <16 x i8>
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[C_COERCE]] to <16 x i8>
+// CHECK-NOFP16-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
+// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP3]]
//
// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vbslq_f16(
// CHECK-FP16-SAME: <8 x i16> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
@@ -81,35 +63,23 @@ float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-NOFP16-LABEL: define dso_local void @test_vzip_f16(
// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A_COERCE:%.*]], <2 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <2 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <2 x i32>
-// CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32>
-// CHECK-NOFP16-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3]]
-// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK-NOFP16-NEXT: [[VZIP3_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32>
-// CHECK-NOFP16-NEXT: store <4 x i16> [[VZIP3_I]], ptr [[TMP10]], align 4, !alias.scope [[META3]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i32>
+// CHECK-NOFP16-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3:![0-9]+]]
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NOFP16-NEXT: [[VZIP3_I:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i32>
+// CHECK-NOFP16-NEXT: store <4 x i16> [[VZIP3_I]], ptr [[TMP2]], align 4, !alias.scope [[META3]]
// CHECK-NOFP16-NEXT: ret void
//
// CHECK-FP16-LABEL: define dso_local void @test_vzip_f16(
// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-FP16-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32>
-// CHECK-FP16-NEXT: store <4 x half> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3]]
-// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT: store <4 x half> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3:![0-9]+]]
+// CHECK-FP16-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-FP16-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32>
-// CHECK-FP16-NEXT: store <4 x half> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META3]]
+// CHECK-FP16-NEXT: store <4 x half> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META3]]
// CHECK-FP16-NEXT: ret void
//
float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
@@ -119,35 +89,23 @@ float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
// CHECK-NOFP16-LABEL: define dso_local void @test_vzipq_f16(
// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A_COERCE:%.*]], <4 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <4 x i32>
-// CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK-NOFP16-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32>
-// CHECK-NOFP16-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6]]
-// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK-NOFP16-NEXT: [[VZIP3_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32>
-// CHECK-NOFP16-NEXT: store <8 x i16> [[VZIP3_I]], ptr [[TMP10]], align 4, !alias.scope [[META6]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x i16>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x i16>
+// CHECK-NOFP16-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32>
+// CHECK-NOFP16-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6:![0-9]+]]
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NOFP16-NEXT: [[VZIP3_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32>
+// CHECK-NOFP16-NEXT: store <8 x i16> [[VZIP3_I]], ptr [[TMP2]], align 4, !alias.scope [[META6]]
// CHECK-NOFP16-NEXT: ret void
//
// CHECK-FP16-LABEL: define dso_local void @test_vzipq_f16(
// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-FP16-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32>
-// CHECK-FP16-NEXT: store <8 x half> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6]]
-// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT: store <8 x half> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6:![0-9]+]]
+// CHECK-FP16-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-FP16-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32>
-// CHECK-FP16-NEXT: store <8 x half> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META6]]
+// CHECK-FP16-NEXT: store <8 x half> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META6]]
// CHECK-FP16-NEXT: ret void
//
float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
@@ -157,35 +115,23 @@ float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NOFP16-LABEL: define dso_local void @test_vuzp_f16(
// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A_COERCE:%.*]], <2 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <2 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <2 x i32>
-// CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32>
-// CHECK-NOFP16-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9]]
-// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK-NOFP16-NEXT: [[VUZP3_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32>
-// CHECK-NOFP16-NEXT: store <4 x i16> [[VUZP3_I]], ptr [[TMP10]], align 4, !alias.scope [[META9]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i32>
+// CHECK-NOFP16-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9:![0-9]+]]
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NOFP16-NEXT: [[VUZP3_I:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i32>
+// CHECK-NOFP16-NEXT: store <4 x i16> [[VUZP3_I]], ptr [[TMP2]], align 4, !alias.scope [[META9]]
// CHECK-NOFP16-NEXT: ret void
//
// CHECK-FP16-LABEL: define dso_local void @test_vuzp_f16(
// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-FP16-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32>
-// CHECK-FP16-NEXT: store <4 x half> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9]]
-// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT: store <4 x half> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9:![0-9]+]]
+// CHECK-FP16-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-FP16-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32>
-// CHECK-FP16-NEXT: store <4 x half> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META9]]
+// CHECK-FP16-NEXT: store <4 x half> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META9]]
// CHECK-FP16-NEXT: ret void
//
float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
@@ -195,35 +141,23 @@ float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
// CHECK-NOFP16-LABEL: define dso_local void @test_vuzpq_f16(
// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A_COERCE:%.*]], <4 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <4 x i32>
-// CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK-NOFP16-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32>
-// CHECK-NOFP16-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12]]
-// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK-NOFP16-NEXT: [[VUZP3_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32>
-// CHECK-NOFP16-NEXT: store <8 x i16> [[VUZP3_I]], ptr [[TMP10]], align 4, !alias.scope [[META12]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x i16>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x i16>
+// CHECK-NOFP16-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32>
+// CHECK-NOFP16-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12:![0-9]+]]
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NOFP16-NEXT: [[VUZP3_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32>
+// CHECK-NOFP16-NEXT: store <8 x i16> [[VUZP3_I]], ptr [[TMP2]], align 4, !alias.scope [[META12]]
// CHECK-NOFP16-NEXT: ret void
//
// CHECK-FP16-LABEL: define dso_local void @test_vuzpq_f16(
// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-FP16-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32>
-// CHECK-FP16-NEXT: store <8 x half> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12]]
-// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT: store <8 x half> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12:![0-9]+]]
+// CHECK-FP16-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-FP16-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32>
-// CHECK-FP16-NEXT: store <8 x half> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope [[META12]]
+// CHECK-FP16-NEXT: store <8 x half> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META12]]
// CHECK-FP16-NEXT: ret void
//
float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
@@ -233,35 +167,23 @@ float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NOFP16-LABEL: define dso_local void @test_vtrn_f16(
// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A_COERCE:%.*]], <2 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <2 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <2 x i32>
-// CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]])
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32>
-// CHECK-NOFP16-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15]]
-// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK-NOFP16-NEXT: [[VTRN3_I:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], <4 x i32>
-// CHECK-NOFP16-NEXT: store <4 x i16> [[VTRN3_I]], ptr [[TMP10]], align 4, !alias.scope [[META15]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i32>
+// CHECK-NOFP16-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15:![0-9]+]]
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NOFP16-NEXT: [[VTRN3_I:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i32>
+// CHECK-NOFP16-NEXT: store <4 x i16> [[VTRN3_I]], ptr [[TMP2]], align 4, !alias.scope [[META15]]
// CHECK-NOFP16-NEXT: ret void
//
// CHECK-FP16-LABEL: define dso_local void @test_vtrn_f16(
// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]])
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-FP16-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32>
-// CHECK-FP16-NEXT: store <4 x half> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15]]
-// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT: store <4 x half> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15:![0-9]+]]
+// CHECK-FP16-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
// CHECK-FP16-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32>
-// CHECK-FP16-NEXT: store <4 x half> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META15]]
+// CHECK-FP16-NEXT: store <4 x half> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META15]]
// CHECK-FP16-NEXT: ret void
//
float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
@@ -271,35 +193,23 @@ float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
// CHECK-NOFP16-LABEL: define dso_local void @test_vtrnq_f16(
// CHECK-NOFP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A_COERCE:%.*]], <4 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <4 x i32>
-// CHECK-NOFP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]])
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK-NOFP16-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK-NOFP16-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32>
-// CHECK-NOFP16-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18]]
-// CHECK-NOFP16-NEXT: [[TMP10:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK-NOFP16-NEXT: [[VTRN3_I:%.*]] = shufflevector <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], <8 x i32>
-// CHECK-NOFP16-NEXT: store <8 x i16> [[VTRN3_I]], ptr [[TMP10]], align 4, !alias.scope [[META18]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x i16>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x i16>
+// CHECK-NOFP16-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32>
+// CHECK-NOFP16-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18:![0-9]+]]
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NOFP16-NEXT: [[VTRN3_I:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32>
+// CHECK-NOFP16-NEXT: store <8 x i16> [[VTRN3_I]], ptr [[TMP2]], align 4, !alias.scope [[META18]]
// CHECK-NOFP16-NEXT: ret void
//
// CHECK-FP16-LABEL: define dso_local void @test_vtrnq_f16(
// CHECK-FP16-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]])
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-FP16-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32>
-// CHECK-FP16-NEXT: store <8 x half> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18]]
-// CHECK-FP16-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x half>, ptr [[AGG_RESULT]], i32 1
+// CHECK-FP16-NEXT: store <8 x half> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18:![0-9]+]]
+// CHECK-FP16-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
// CHECK-FP16-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32>
-// CHECK-FP16-NEXT: store <8 x half> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope [[META18]]
+// CHECK-FP16-NEXT: store <8 x half> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META18]]
// CHECK-FP16-NEXT: ret void
//
float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
@@ -309,20 +219,16 @@ float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NOFP16-LABEL: define dso_local <2 x i32> @test_vmov_n_f16(
// CHECK-NOFP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
-// CHECK-NOFP16-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-NOFP16-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-NOFP16-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NOFP16-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i64 0
+// CHECK-NOFP16-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VECINIT3]] to <2 x i32>
// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP0]]
//
// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vmov_n_f16(
// CHECK-FP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
-// CHECK-FP16-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-FP16-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-FP16-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-FP16-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i64 0
+// CHECK-FP16-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
// CHECK-FP16-NEXT: ret <4 x half> [[VECINIT3]]
//
float16x4_t test_vmov_n_f16(float16_t a) {
@@ -332,28 +238,16 @@ float16x4_t test_vmov_n_f16(float16_t a) {
// CHECK-NOFP16-LABEL: define dso_local <4 x i32> @test_vmovq_n_f16(
// CHECK-NOFP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
-// CHECK-NOFP16-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-NOFP16-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-NOFP16-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
-// CHECK-NOFP16-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
-// CHECK-NOFP16-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
-// CHECK-NOFP16-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
-// CHECK-NOFP16-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-NOFP16-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i64 0
+// CHECK-NOFP16-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VECINIT7]] to <4 x i32>
// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP0]]
//
// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vmovq_n_f16(
// CHECK-FP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
-// CHECK-FP16-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-FP16-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-FP16-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
-// CHECK-FP16-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
-// CHECK-FP16-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
-// CHECK-FP16-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
-// CHECK-FP16-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-FP16-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i64 0
+// CHECK-FP16-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
// CHECK-FP16-NEXT: ret <8 x half> [[VECINIT7]]
//
float16x8_t test_vmovq_n_f16(float16_t a) {
@@ -363,20 +257,16 @@ float16x8_t test_vmovq_n_f16(float16_t a) {
// CHECK-NOFP16-LABEL: define dso_local <2 x i32> @test_vdup_n_f16(
// CHECK-NOFP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
-// CHECK-NOFP16-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-NOFP16-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-NOFP16-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NOFP16-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i64 0
+// CHECK-NOFP16-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VECINIT3]] to <2 x i32>
// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP0]]
//
// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vdup_n_f16(
// CHECK-FP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i32 0
-// CHECK-FP16-NEXT: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-FP16-NEXT: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-FP16-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-FP16-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[A]], i64 0
+// CHECK-FP16-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
// CHECK-FP16-NEXT: ret <4 x half> [[VECINIT3]]
//
float16x4_t test_vdup_n_f16(float16_t a) {
@@ -386,28 +276,16 @@ float16x4_t test_vdup_n_f16(float16_t a) {
// CHECK-NOFP16-LABEL: define dso_local <4 x i32> @test_vdupq_n_f16(
// CHECK-NOFP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
-// CHECK-NOFP16-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-NOFP16-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-NOFP16-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
-// CHECK-NOFP16-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
-// CHECK-NOFP16-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
-// CHECK-NOFP16-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
-// CHECK-NOFP16-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-NOFP16-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i64 0
+// CHECK-NOFP16-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VECINIT7]] to <4 x i32>
// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP0]]
//
// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vdupq_n_f16(
// CHECK-FP16-SAME: half noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i32 0
-// CHECK-FP16-NEXT: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
-// CHECK-FP16-NEXT: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
-// CHECK-FP16-NEXT: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
-// CHECK-FP16-NEXT: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
-// CHECK-FP16-NEXT: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
-// CHECK-FP16-NEXT: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
-// CHECK-FP16-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-FP16-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[A]], i64 0
+// CHECK-FP16-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
// CHECK-FP16-NEXT: ret <8 x half> [[VECINIT7]]
//
float16x8_t test_vdupq_n_f16(float16_t a) {
@@ -418,19 +296,14 @@ float16x8_t test_vdupq_n_f16(float16_t a) {
// CHECK-NOFP16-SAME: <2 x i32> noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <2 x i32>
-// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP4]]
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <4 x i32>
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP1]] to <2 x i32>
+// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP2]]
//
// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vdup_lane_f16(
// CHECK-FP16-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-FP16-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32>
+// CHECK-FP16-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[A]], <4 x half> poison, <4 x i32>
// CHECK-FP16-NEXT: ret <4 x half> [[LANE]]
//
float16x4_t test_vdup_lane_f16(float16x4_t a) {
@@ -440,20 +313,15 @@ float16x4_t test_vdup_lane_f16(float16x4_t a) {
// CHECK-NOFP16-LABEL: define dso_local <4 x i32> @test_vdupq_lane_f16(
// CHECK-NOFP16-SAME: <2 x i32> noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <8 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <4 x i32>
-// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP4]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <8 x i32>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[LANE]] to <4 x i32>
+// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP1]]
//
// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vdupq_lane_f16(
// CHECK-FP16-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-FP16-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32>
+// CHECK-FP16-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[A]], <4 x half> poison, <8 x i32>
// CHECK-FP16-NEXT: ret <8 x half> [[LANE]]
//
float16x8_t test_vdupq_lane_f16(float16x4_t a) {
@@ -463,25 +331,16 @@ float16x8_t test_vdupq_lane_f16(float16x4_t a) {
// CHECK-NOFP16-LABEL: define dso_local <2 x i32> @test_vext_f16(
// CHECK-NOFP16-SAME: <2 x i32> noundef [[A_COERCE:%.*]], <2 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[TMP0]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[TMP1]] to <8 x i8>
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
-// CHECK-NOFP16-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[VEXT]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <2 x i32>
-// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP7]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B_COERCE]] to <4 x i16>
+// CHECK-NOFP16-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i32>
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VEXT]] to <2 x i32>
+// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP2]]
//
// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vext_f16(
// CHECK-FP16-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK-FP16-NEXT: [[VEXT:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP3]], <4 x i32>
+// CHECK-FP16-NEXT: [[VEXT:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32>
// CHECK-FP16-NEXT: ret <4 x half> [[VEXT]]
//
float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
@@ -491,25 +350,16 @@ float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
// CHECK-NOFP16-LABEL: define dso_local <4 x i32> @test_vextq_f16(
// CHECK-NOFP16-SAME: <4 x i32> noundef [[A_COERCE:%.*]], <4 x i32> noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
-// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[TMP0]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[TMP1]] to <16 x i8>
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP3]] to <8 x i16>
-// CHECK-NOFP16-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32>
-// CHECK-NOFP16-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[VEXT]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <4 x i32>
-// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP7]]
+// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x i16>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B_COERCE]] to <8 x i16>
+// CHECK-NOFP16-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i32>
+// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[VEXT]] to <4 x i32>
+// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP2]]
//
// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vextq_f16(
// CHECK-FP16-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-FP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-FP16-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK-FP16-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-FP16-NEXT: [[VEXT:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x i32>
+// CHECK-FP16-NEXT: [[VEXT:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32>
// CHECK-FP16-NEXT: ret <8 x half> [[VEXT]]
//
float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) {
@@ -520,18 +370,14 @@ float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NOFP16-SAME: <2 x i32> noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A_COERCE]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[TMP0]] to <2 x i32>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP2]], <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[SHUFFLE_I]] to <2 x i32>
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <4 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <2 x i32>
-// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP5]]
+// CHECK-NOFP16-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <4 x i32>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[SHUFFLE_I]] to <2 x i32>
+// CHECK-NOFP16-NEXT: ret <2 x i32> [[TMP1]]
//
// CHECK-FP16-LABEL: define dso_local <4 x half> @test_vrev64_f16(
// CHECK-FP16-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[A]], <4 x i32>
+// CHECK-FP16-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> poison, <4 x i32>
// CHECK-FP16-NEXT: ret <4 x half> [[SHUFFLE_I]]
//
float16x4_t test_vrev64_f16(float16x4_t a) {
@@ -542,18 +388,14 @@ float16x4_t test_vrev64_f16(float16x4_t a) {
// CHECK-NOFP16-SAME: <4 x i32> noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
// CHECK-NOFP16-NEXT: entry:
// CHECK-NOFP16-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A_COERCE]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP2]], <8 x i32>
-// CHECK-NOFP16-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[SHUFFLE_I]] to <4 x i32>
-// CHECK-NOFP16-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x half>
-// CHECK-NOFP16-NEXT: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x i32>
-// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP5]]
+// CHECK-NOFP16-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[TMP0]], <8 x half> poison, <8 x i32>
+// CHECK-NOFP16-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[SHUFFLE_I]] to <4 x i32>
+// CHECK-NOFP16-NEXT: ret <4 x i32> [[TMP1]]
//
// CHECK-FP16-LABEL: define dso_local <8 x half> @test_vrev64q_f16(
// CHECK-FP16-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-FP16-NEXT: entry:
-// CHECK-FP16-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <8 x i32>
+// CHECK-FP16-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <8 x i32>
// CHECK-FP16-NEXT: ret <8 x half> [[SHUFFLE_I]]
//
float16x8_t test_vrev64q_f16(float16x8_t a) {
diff --git a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
index 59f56b988d2ab..fc8ae4f9a1c04 100644
--- a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c
@@ -1,819 +1,1138 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// RUN: %clang_cc1 -triple armv8.2a-linux-gnu -target-abi apcs-gnu -target-feature +neon -target-feature +fullfp16 \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
-// RUN: | opt -S -passes=mem2reg \
+// RUN: | opt -S -passes=mem2reg,instcombine \
// RUN: | FileCheck %s
// REQUIRES: arm-registered-target
#include <arm_neon.h>
-// CHECK-LABEL: test_vabs_f16
-// CHECK: [[ABS:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a)
-// CHECK: ret <4 x half> [[ABS]]
+// CHECK-LABEL: define dso_local <4 x half> @test_vabs_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> [[A]])
+// CHECK-NEXT: ret <4 x half> [[VABS1_I]]
+//
float16x4_t test_vabs_f16(float16x4_t a) {
  return vabs_f16(a);
}
-// CHECK-LABEL: test_vabsq_f16
-// CHECK: [[ABS:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
-// CHECK: ret <8 x half> [[ABS]]
+// CHECK-LABEL: define dso_local <8 x half> @test_vabsq_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> [[A]])
+// CHECK-NEXT: ret <8 x half> [[VABS1_I]]
+//
float16x8_t test_vabsq_f16(float16x8_t a) {
  return vabsq_f16(a);
}
-// CHECK-LABEL: test_vceqz_f16
-// CHECK: [[TMP1:%.*]] = fcmp oeq <4 x half> %a, zeroinitializer
-// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK: ret <4 x
i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vceqz_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <4 x half> [[A]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCEQZ_I]] +// uint16x4_t test_vceqz_f16(float16x4_t a) { return vceqz_f16(a); } -// CHECK-LABEL: test_vceqzq_f16 -// CHECK: [[TMP1:%.*]] = fcmp oeq <8 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vceqzq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oeq <8 x half> [[A]], zeroinitializer +// CHECK-NEXT: [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCEQZ_I]] +// uint16x8_t test_vceqzq_f16(float16x8_t a) { return vceqzq_f16(a); } -// CHECK-LABEL: test_vcgez_f16 -// CHECK: [[TMP1:%.*]] = fcmp oge <4 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcgez_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <4 x half> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCGEZ_I]] +// uint16x4_t test_vcgez_f16(float16x4_t a) { return vcgez_f16(a); } -// CHECK-LABEL: test_vcgezq_f16 -// CHECK: [[TMP1:%.*]] = fcmp oge <8 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgezq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp oge <8 x half> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCGEZ_I]] +// uint16x8_t test_vcgezq_f16(float16x8_t a) { return vcgezq_f16(a); } -// CHECK-LABEL: test_vcgtz_f16 -// CHECK: [[TMP1:%.*]] = fcmp ogt <4 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcgtz_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <4 x half> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCGTZ_I]] +// uint16x4_t test_vcgtz_f16(float16x4_t a) { return vcgtz_f16(a); } -// CHECK-LABEL: test_vcgtzq_f16 -// CHECK: [[TMP1:%.*]] = fcmp ogt <8 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgtzq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ogt <8 x half> [[A]], zeroinitializer +// CHECK-NEXT: [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCGTZ_I]] +// uint16x8_t test_vcgtzq_f16(float16x8_t a) { return vcgtzq_f16(a); } -// CHECK-LABEL: test_vclez_f16 -// CHECK: [[TMP1:%.*]] = fcmp ole <4 x half> %a, zeroinitializer -// CHECK: 
[[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vclez_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <4 x half> [[A]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCLEZ_I]] +// uint16x4_t test_vclez_f16(float16x4_t a) { return vclez_f16(a); } -// CHECK-LABEL: test_vclezq_f16 -// CHECK: [[TMP1:%.*]] = fcmp ole <8 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vclezq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <8 x half> [[A]], zeroinitializer +// CHECK-NEXT: [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCLEZ_I]] +// uint16x8_t test_vclezq_f16(float16x8_t a) { return vclezq_f16(a); } -// CHECK-LABEL: test_vcltz_f16 -// CHECK: [[TMP1:%.*]] = fcmp olt <4 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcltz_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x half> [[A]], zeroinitializer +// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP0]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCLTZ_I]] +// uint16x4_t test_vcltz_f16(float16x4_t a) { return vcltz_f16(a); } -// CHECK-LABEL: test_vcltzq_f16 -// CHECK: [[TMP1:%.*]] = fcmp olt <8 x half> %a, zeroinitializer -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcltzq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <8 x half> [[A]], zeroinitializer +// CHECK-NEXT: [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCLTZ_I]] +// uint16x8_t test_vcltzq_f16(float16x8_t a) { return vcltzq_f16(a); } -// CHECK-LABEL: test_vcvt_f16_s16 -// CHECK: [[VCVT:%.*]] = sitofp <4 x i16> %a to <4 x half> -// CHECK: ret <4 x half> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x half> @test_vcvt_f16_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <4 x i16> [[A]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[VCVT_I]] +// float16x4_t test_vcvt_f16_s16 (int16x4_t a) { return vcvt_f16_s16(a); } -// CHECK-LABEL: test_vcvtq_f16_s16 -// CHECK: [[VCVT:%.*]] = sitofp <8 x i16> %a to <8 x half> -// CHECK: ret <8 x half> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x half> @test_vcvtq_f16_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <8 x i16> [[A]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[VCVT_I]] +// float16x8_t test_vcvtq_f16_s16 (int16x8_t a) { return vcvtq_f16_s16(a); } -// CHECK-LABEL: test_vcvt_f16_u16 -// CHECK: [[VCVT:%.*]] = uitofp <4 x i16> %a to <4 x half> -// CHECK: ret <4 x half> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x half> @test_vcvt_f16_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[VCVT_I:%.*]] = uitofp <4 x i16> [[A]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[VCVT_I]] +// float16x4_t test_vcvt_f16_u16 (uint16x4_t a) { return vcvt_f16_u16(a); } -// CHECK-LABEL: test_vcvtq_f16_u16 -// CHECK: [[VCVT:%.*]] = uitofp <8 x i16> %a to <8 x half> -// CHECK: ret <8 x half> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x half> @test_vcvtq_f16_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <8 x i16> [[A]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[VCVT_I]] +// float16x8_t test_vcvtq_f16_u16 (uint16x8_t a) { return vcvtq_f16_u16(a); } -// CHECK-LABEL: test_vcvt_s16_f16 -// CHECK: [[VCVT:%.*]] = fptosi <4 x half> %a to <4 x i16> -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvt_s16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = fptosi <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCVT_I]] +// int16x4_t test_vcvt_s16_f16 (float16x4_t a) { return vcvt_s16_f16(a); } -// CHECK-LABEL: test_vcvtq_s16_f16 -// CHECK: [[VCVT:%.*]] = fptosi <8 x half> %a to <8 x i16> -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtq_s16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = fptosi <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCVT_I]] +// int16x8_t test_vcvtq_s16_f16 (float16x8_t a) { return vcvtq_s16_f16(a); } -// CHECK-LABEL: test_vcvt_u16_f16 -// CHECK: [[VCVT:%.*]] = fptoui <4 x half> %a to <4 x i16> -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvt_u16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = fptoui <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VCVT_I]] +// int16x4_t test_vcvt_u16_f16 (float16x4_t a) { return vcvt_u16_f16(a); } -// CHECK-LABEL: test_vcvtq_u16_f16 -// CHECK: [[VCVT:%.*]] = fptoui <8 x half> %a to <8 x i16> -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtq_u16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = fptoui <8 x half> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VCVT_I]] +// int16x8_t test_vcvtq_u16_f16 (float16x8_t a) { return vcvtq_u16_f16(a); } -// CHECK-LABEL: test_vcvta_s16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtas.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvta_s16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTA_S16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtas.v4i16.v4f16(<4 x half> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTA_S16_F161_I]] +// int16x4_t test_vcvta_s16_f16 (float16x4_t a) { return vcvta_s16_f16(a); } -// CHECK-LABEL: test_vcvta_u16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtau.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvta_u16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTA_U16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtau.v4i16.v4f16(<4 x half> [[A]]) +// CHECK-NEXT: ret <4 x i16> 
[[VCVTA_U16_F161_I]] +// int16x4_t test_vcvta_u16_f16 (float16x4_t a) { return vcvta_u16_f16(a); } -// CHECK-LABEL: test_vcvtaq_s16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtas.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtaq_s16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTAQ_S16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtas.v8i16.v8f16(<8 x half> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VCVTAQ_S16_F161_I]] +// int16x8_t test_vcvtaq_s16_f16 (float16x8_t a) { return vcvtaq_s16_f16(a); } -// CHECK-LABEL: test_vcvtm_s16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtms.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtm_s16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTM_S16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtms.v4i16.v4f16(<4 x half> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTM_S16_F161_I]] +// int16x4_t test_vcvtm_s16_f16 (float16x4_t a) { return vcvtm_s16_f16(a); } -// CHECK-LABEL: test_vcvtmq_s16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtms.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtmq_s16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTMQ_S16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtms.v8i16.v8f16(<8 x half> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VCVTMQ_S16_F161_I]] +// int16x8_t test_vcvtmq_s16_f16 (float16x8_t a) { return vcvtmq_s16_f16(a); } -// CHECK-LABEL: test_vcvtm_u16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtmu.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtm_u16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTM_U16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtmu.v4i16.v4f16(<4 x half> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTM_U16_F161_I]] +// uint16x4_t test_vcvtm_u16_f16 (float16x4_t a) { return vcvtm_u16_f16(a); } -// CHECK-LABEL: test_vcvtmq_u16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtmu.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtmq_u16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTMQ_U16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtmu.v8i16.v8f16(<8 x half> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VCVTMQ_U16_F161_I]] +// uint16x8_t test_vcvtmq_u16_f16 (float16x8_t a) { return vcvtmq_u16_f16(a); } -// CHECK-LABEL: test_vcvtn_s16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtns.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtn_s16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTN_S16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtns.v4i16.v4f16(<4 x half> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTN_S16_F161_I]] +// int16x4_t test_vcvtn_s16_f16 (float16x4_t a) { return vcvtn_s16_f16(a); } -// CHECK-LABEL: test_vcvtnq_s16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtns.v8i16.v8f16(<8 x 
half> %a) -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtnq_s16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTNQ_S16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtns.v8i16.v8f16(<8 x half> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VCVTNQ_S16_F161_I]] +// int16x8_t test_vcvtnq_s16_f16 (float16x8_t a) { return vcvtnq_s16_f16(a); } -// CHECK-LABEL: test_vcvtn_u16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtnu.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtn_u16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTN_U16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtnu.v4i16.v4f16(<4 x half> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTN_U16_F161_I]] +// uint16x4_t test_vcvtn_u16_f16 (float16x4_t a) { return vcvtn_u16_f16(a); } -// CHECK-LABEL: test_vcvtnq_u16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtnu.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtnq_u16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTNQ_U16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtnu.v8i16.v8f16(<8 x half> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VCVTNQ_U16_F161_I]] +// uint16x8_t test_vcvtnq_u16_f16 (float16x8_t a) { return vcvtnq_u16_f16(a); } -// CHECK-LABEL: test_vcvtp_s16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtps.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtp_s16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTP_S16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtps.v4i16.v4f16(<4 x half> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTP_S16_F161_I]] +// int16x4_t test_vcvtp_s16_f16 (float16x4_t a) { return vcvtp_s16_f16(a); } -// CHECK-LABEL: test_vcvtpq_s16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtps.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtpq_s16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTPQ_S16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtps.v8i16.v8f16(<8 x half> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VCVTPQ_S16_F161_I]] +// int16x8_t test_vcvtpq_s16_f16 (float16x8_t a) { return vcvtpq_s16_f16(a); } -// CHECK-LABEL: test_vcvtp_u16_f16 -// CHECK: [[VCVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtpu.v4i16.v4f16(<4 x half> %a) -// CHECK: ret <4 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvtp_u16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVTP_U16_F161_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtpu.v4i16.v4f16(<4 x half> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VCVTP_U16_F161_I]] +// uint16x4_t test_vcvtp_u16_f16 (float16x4_t a) { return vcvtp_u16_f16(a); } -// CHECK-LABEL: test_vcvtpq_u16_f16 -// CHECK: [[VCVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtpu.v8i16.v8f16(<8 x half> %a) -// CHECK: ret <8 x i16> [[VCVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtpq_u16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: 
[[VCVTPQ_U16_F161_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtpu.v8i16.v8f16(<8 x half> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VCVTPQ_U16_F161_I]] +// uint16x8_t test_vcvtpq_u16_f16 (float16x8_t a) { return vcvtpq_u16_f16(a); } // FIXME: Fix the zero constant when fp16 non-storage-only type becomes available. -// CHECK-LABEL: test_vneg_f16 -// CHECK: [[NEG:%.*]] = fneg <4 x half> %a -// CHECK: ret <4 x half> [[NEG]] +// CHECK-LABEL: define dso_local <4 x half> @test_vneg_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x half> [[A]] +// CHECK-NEXT: ret <4 x half> [[FNEG_I]] +// float16x4_t test_vneg_f16(float16x4_t a) { return vneg_f16(a); } -// CHECK-LABEL: test_vnegq_f16 -// CHECK: [[NEG:%.*]] = fneg <8 x half> %a -// CHECK: ret <8 x half> [[NEG]] +// CHECK-LABEL: define dso_local <8 x half> @test_vnegq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <8 x half> [[A]] +// CHECK-NEXT: ret <8 x half> [[FNEG_I]] +// float16x8_t test_vnegq_f16(float16x8_t a) { return vnegq_f16(a); } -// CHECK-LABEL: test_vrecpe_f16 -// CHECK: [[RCP:%.*]] = call <4 x half> @llvm.arm.neon.vrecpe.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RCP]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrecpe_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrecpe.v4f16(<4 x half> [[A]]) +// CHECK-NEXT: ret <4 x half> [[VRECPE_V1_I]] +// float16x4_t test_vrecpe_f16(float16x4_t a) { return vrecpe_f16(a); } -// CHECK-LABEL: test_vrecpeq_f16 -// CHECK: [[RCP:%.*]] = call <8 x half> @llvm.arm.neon.vrecpe.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RCP]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrecpeq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrecpe.v8f16(<8 x half> [[A]]) +// CHECK-NEXT: ret <8 x half> [[VRECPEQ_V1_I]] +// float16x8_t test_vrecpeq_f16(float16x8_t a) { return vrecpeq_f16(a); } -// CHECK-LABEL: test_vrnd_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintz.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrnd_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRND_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrintz.v4f16(<4 x half> [[A]]) +// CHECK-NEXT: ret <4 x half> [[VRND_V1_I]] +// float16x4_t test_vrnd_f16(float16x4_t a) { return vrnd_f16(a); } -// CHECK-LABEL: test_vrndq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintz.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrndq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrintz.v8f16(<8 x half> [[A]]) +// CHECK-NEXT: ret <8 x half> [[VRNDQ_V1_I]] +// float16x8_t test_vrndq_f16(float16x8_t a) { return vrndq_f16(a); } -// CHECK-LABEL: test_vrnda_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrinta.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrnda_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[VRNDA_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrinta.v4f16(<4 x half> [[A]]) +// CHECK-NEXT: ret <4 x half> [[VRNDA_V1_I]] +// float16x4_t test_vrnda_f16(float16x4_t a) { return vrnda_f16(a); } -// CHECK-LABEL: test_vrndaq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrinta.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrndaq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDAQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrinta.v8f16(<8 x half> [[A]]) +// CHECK-NEXT: ret <8 x half> [[VRNDAQ_V1_I]] +// float16x8_t test_vrndaq_f16(float16x8_t a) { return vrndaq_f16(a); } -// CHECK-LABEL: test_vrndm_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintm.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrndm_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDM_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrintm.v4f16(<4 x half> [[A]]) +// CHECK-NEXT: ret <4 x half> [[VRNDM_V1_I]] +// float16x4_t test_vrndm_f16(float16x4_t a) { return vrndm_f16(a); } -// CHECK-LABEL: test_vrndmq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintm.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrndmq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDMQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrintm.v8f16(<8 x half> [[A]]) +// CHECK-NEXT: ret <8 x half> [[VRNDMQ_V1_I]] +// float16x8_t test_vrndmq_f16(float16x8_t a) { return vrndmq_f16(a); } -// CHECK-LABEL: test_vrndn_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintn.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrndn_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDN_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrintn.v4f16(<4 x half> [[A]]) +// CHECK-NEXT: ret <4 x half> [[VRNDN_V1_I]] +// float16x4_t test_vrndn_f16(float16x4_t a) { return vrndn_f16(a); } -// CHECK-LABEL: test_vrndnq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintn.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrndnq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDNQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrintn.v8f16(<8 x half> [[A]]) +// CHECK-NEXT: ret <8 x half> [[VRNDNQ_V1_I]] +// float16x8_t test_vrndnq_f16(float16x8_t a) { return vrndnq_f16(a); } -// CHECK-LABEL: test_vrndp_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintp.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrndp_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDP_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrintp.v4f16(<4 x half> [[A]]) +// CHECK-NEXT: ret <4 x half> [[VRNDP_V1_I]] +// float16x4_t test_vrndp_f16(float16x4_t a) { return vrndp_f16(a); } -// CHECK-LABEL: test_vrndpq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintp.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrndpq_f16( +// 
CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDPQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrintp.v8f16(<8 x half> [[A]]) +// CHECK-NEXT: ret <8 x half> [[VRNDPQ_V1_I]] +// float16x8_t test_vrndpq_f16(float16x8_t a) { return vrndpq_f16(a); } -// CHECK-LABEL: test_vrndx_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrintx.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrndx_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDX_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrintx.v4f16(<4 x half> [[A]]) +// CHECK-NEXT: ret <4 x half> [[VRNDX_V1_I]] +// float16x4_t test_vrndx_f16(float16x4_t a) { return vrndx_f16(a); } -// CHECK-LABEL: test_vrndxq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrintx.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrndxq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRNDXQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrintx.v8f16(<8 x half> [[A]]) +// CHECK-NEXT: ret <8 x half> [[VRNDXQ_V1_I]] +// float16x8_t test_vrndxq_f16(float16x8_t a) { return vrndxq_f16(a); } -// CHECK-LABEL: test_vrsqrte_f16 -// CHECK: [[RND:%.*]] = call <4 x half> @llvm.arm.neon.vrsqrte.v4f16(<4 x half> %a) -// CHECK: ret <4 x half> [[RND]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrsqrte_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <4 x half> @llvm.arm.neon.vrsqrte.v4f16(<4 x half> [[A]]) +// CHECK-NEXT: ret <4 x half> [[VRSQRTE_V1_I]] +// float16x4_t test_vrsqrte_f16(float16x4_t a) { return vrsqrte_f16(a); } -// CHECK-LABEL: test_vrsqrteq_f16 -// CHECK: [[RND:%.*]] = call <8 x half> @llvm.arm.neon.vrsqrte.v8f16(<8 x half> %a) -// CHECK: ret <8 x half> [[RND]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrsqrteq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <8 x half> @llvm.arm.neon.vrsqrte.v8f16(<8 x half> [[A]]) +// CHECK-NEXT: ret <8 x half> [[VRSQRTEQ_V1_I]] +// float16x8_t test_vrsqrteq_f16(float16x8_t a) { return vrsqrteq_f16(a); } -// CHECK-LABEL: test_vadd_f16 -// CHECK: [[ADD:%.*]] = fadd <4 x half> %a, %b -// CHECK: ret <4 x half> [[ADD]] +// CHECK-LABEL: define dso_local <4 x half> @test_vadd_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x half> [[A]], [[B]] +// CHECK-NEXT: ret <4 x half> [[ADD_I]] +// float16x4_t test_vadd_f16(float16x4_t a, float16x4_t b) { return vadd_f16(a, b); } -// CHECK-LABEL: test_vaddq_f16 -// CHECK: [[ADD:%.*]] = fadd <8 x half> %a, %b -// CHECK: ret <8 x half> [[ADD]] +// CHECK-LABEL: define dso_local <8 x half> @test_vaddq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[ADD_I:%.*]] = fadd <8 x half> [[A]], [[B]] +// CHECK-NEXT: ret <8 x half> [[ADD_I]] +// float16x8_t test_vaddq_f16(float16x8_t a, float16x8_t b) { return vaddq_f16(a, b); } -// CHECK-LABEL: test_vabd_f16 -// CHECK: [[ABD:%.*]] = call <4 x half> @llvm.arm.neon.vabds.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[ABD]] +// 
CHECK-LABEL: define dso_local <4 x half> @test_vabd_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vabds.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: ret <4 x half> [[VABD_V2_I]] +// float16x4_t test_vabd_f16(float16x4_t a, float16x4_t b) { return vabd_f16(a, b); } -// CHECK-LABEL: test_vabdq_f16 -// CHECK: [[ABD:%.*]] = call <8 x half> @llvm.arm.neon.vabds.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[ABD]] +// CHECK-LABEL: define dso_local <8 x half> @test_vabdq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vabds.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: ret <8 x half> [[VABDQ_V2_I]] +// float16x8_t test_vabdq_f16(float16x8_t a, float16x8_t b) { return vabdq_f16(a, b); } -// CHECK-LABEL: test_vcage_f16 -// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.arm.neon.vacge.v4i16.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x i16> [[ABS]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcage_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vacge.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VCAGE_V2_I]] +// uint16x4_t test_vcage_f16(float16x4_t a, float16x4_t b) { return vcage_f16(a, b); } -// CHECK-LABEL: test_vcageq_f16 -// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.arm.neon.vacge.v8i16.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x i16> [[ABS]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcageq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vacge.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VCAGEQ_V2_I]] +// uint16x8_t test_vcageq_f16(float16x8_t a, float16x8_t b) { return vcageq_f16(a, b); } -// CHECK-LABEL: test_vcagt_f16 -// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.arm.neon.vacgt.v4i16.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x i16> [[ABS]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcagt_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vacgt.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VCAGT_V2_I]] +// uint16x4_t test_vcagt_f16(float16x4_t a, float16x4_t b) { return vcagt_f16(a, b); } -// CHECK-LABEL: test_vcagtq_f16 -// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.arm.neon.vacgt.v8i16.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x i16> [[ABS]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcagtq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vacgt.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VCAGTQ_V2_I]] +// uint16x8_t test_vcagtq_f16(float16x8_t a, float16x8_t b) { return vcagtq_f16(a, b); } -// CHECK-LABEL: test_vcale_f16 -// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.arm.neon.vacge.v4i16.v4f16(<4 x half> 
%b, <4 x half> %a) -// CHECK: ret <4 x i16> [[ABS]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcale_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vacge.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VCALE_V2_I]] +// uint16x4_t test_vcale_f16(float16x4_t a, float16x4_t b) { return vcale_f16(a, b); } -// CHECK-LABEL: test_vcaleq_f16 -// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.arm.neon.vacge.v8i16.v8f16(<8 x half> %b, <8 x half> %a) -// CHECK: ret <8 x i16> [[ABS]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcaleq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vacge.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VCALEQ_V2_I]] +// uint16x8_t test_vcaleq_f16(float16x8_t a, float16x8_t b) { return vcaleq_f16(a, b); } -// CHECK-LABEL: test_vcalt_f16 -// CHECK: [[ABS:%.*]] = call <4 x i16> @llvm.arm.neon.vacgt.v4i16.v4f16(<4 x half> %b, <4 x half> %a) -// CHECK: ret <4 x i16> [[ABS]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcalt_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vacgt.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VCALT_V2_I]] +// uint16x4_t test_vcalt_f16(float16x4_t a, float16x4_t b) { return vcalt_f16(a, b); } -// CHECK-LABEL: test_vcaltq_f16 -// CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.arm.neon.vacgt.v8i16.v8f16(<8 x half> %b, <8 x half> %a) -// CHECK: ret <8 x i16> [[ABS]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcaltq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vacgt.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VCALTQ_V2_I]] +// uint16x8_t test_vcaltq_f16(float16x8_t a, float16x8_t b) { return vcaltq_f16(a, b); } -// CHECK-LABEL: test_vceq_f16 -// CHECK: [[TMP1:%.*]] = fcmp oeq <4 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vceq_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <4 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vceq_f16(float16x4_t a, float16x4_t b) { return vceq_f16(a, b); } -// CHECK-LABEL: test_vceqq_f16 -// CHECK: [[TMP1:%.*]] = fcmp oeq <8 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vceqq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <8 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vceqq_f16(float16x8_t a, float16x8_t b) { return vceqq_f16(a, b); } 
-// CHECK-LABEL: test_vcge_f16 -// CHECK: [[TMP1:%.*]] = fcmp oge <4 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcge_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <4 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcge_f16(float16x4_t a, float16x4_t b) { return vcge_f16(a, b); } -// CHECK-LABEL: test_vcgeq_f16 -// CHECK: [[TMP1:%.*]] = fcmp oge <8 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgeq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <8 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcgeq_f16(float16x8_t a, float16x8_t b) { return vcgeq_f16(a, b); } -// CHECK-LABEL: test_vcgt_f16 -// CHECK: [[TMP1:%.*]] = fcmp ogt <4 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcgt_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <4 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcgt_f16(float16x4_t a, float16x4_t b) { return vcgt_f16(a, b); } -// CHECK-LABEL: test_vcgtq_f16 -// CHECK: [[TMP1:%.*]] = fcmp ogt <8 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcgtq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <8 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcgtq_f16(float16x8_t a, float16x8_t b) { return vcgtq_f16(a, b); } -// CHECK-LABEL: test_vcle_f16 -// CHECK: [[TMP1:%.*]] = fcmp ole <4 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcle_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <4 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vcle_f16(float16x4_t a, float16x4_t b) { return vcle_f16(a, b); } -// CHECK-LABEL: test_vcleq_f16 -// CHECK: [[TMP1:%.*]] = fcmp ole <8 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcleq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <8 x half> [[A]], [[B]] +// 
CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcleq_f16(float16x8_t a, float16x8_t b) { return vcleq_f16(a, b); } -// CHECK-LABEL: test_vclt_f16 -// CHECK: [[TMP1:%.*]] = fcmp olt <4 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vclt_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[SEXT_I]] +// uint16x4_t test_vclt_f16(float16x4_t a, float16x4_t b) { return vclt_f16(a, b); } -// CHECK-LABEL: test_vcltq_f16 -// CHECK: [[TMP1:%.*]] = fcmp olt <8 x half> %a, %b -// CHECK: [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16> -// CHECK: ret <8 x i16> [[TMP2]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcltq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <8 x half> [[A]], [[B]] +// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[SEXT_I]] +// uint16x8_t test_vcltq_f16(float16x8_t a, float16x8_t b) { return vcltq_f16(a, b); } -// CHECK-LABEL: test_vcvt_n_f16_s16 -// CHECK: [[CVT:%.*]] = call <4 x half> @llvm.arm.neon.vcvtfxs2fp.v4f16.v4i16(<4 x i16> %vcvt_n, i32 2) -// CHECK: ret <4 x half> [[CVT]] +// CHECK-LABEL: define dso_local <4 x half> @test_vcvt_n_f16_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x half> @llvm.arm.neon.vcvtfxs2fp.v4f16.v4i16(<4 x i16> [[A]], i32 2) +// CHECK-NEXT: ret <4 x half> [[VCVT_N1]] +// float16x4_t test_vcvt_n_f16_s16(int16x4_t a) { return vcvt_n_f16_s16(a, 2); } -// CHECK-LABEL: test_vcvtq_n_f16_s16 -// CHECK: [[CVT:%.*]] = call <8 x half> @llvm.arm.neon.vcvtfxs2fp.v8f16.v8i16(<8 x i16> %vcvt_n, i32 2) -// CHECK: ret <8 x half> [[CVT]] +// CHECK-LABEL: define dso_local <8 x half> @test_vcvtq_n_f16_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x half> @llvm.arm.neon.vcvtfxs2fp.v8f16.v8i16(<8 x i16> [[A]], i32 2) +// CHECK-NEXT: ret <8 x half> [[VCVT_N1]] +// float16x8_t test_vcvtq_n_f16_s16(int16x8_t a) { return vcvtq_n_f16_s16(a, 2); } -// CHECK-LABEL: test_vcvt_n_f16_u16 -// CHECK: [[CVT:%.*]] = call <4 x half> @llvm.arm.neon.vcvtfxu2fp.v4f16.v4i16(<4 x i16> %vcvt_n, i32 2) -// CHECK: ret <4 x half> [[CVT]] +// CHECK-LABEL: define dso_local <4 x half> @test_vcvt_n_f16_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x half> @llvm.arm.neon.vcvtfxu2fp.v4f16.v4i16(<4 x i16> [[A]], i32 2) +// CHECK-NEXT: ret <4 x half> [[VCVT_N1]] +// float16x4_t test_vcvt_n_f16_u16(uint16x4_t a) { return vcvt_n_f16_u16(a, 2); } -// CHECK-LABEL: test_vcvtq_n_f16_u16 -// CHECK: [[CVT:%.*]] = call <8 x half> @llvm.arm.neon.vcvtfxu2fp.v8f16.v8i16(<8 x i16> %vcvt_n, i32 2) -// CHECK: ret <8 x half> [[CVT]] +// CHECK-LABEL: define dso_local <8 x half> @test_vcvtq_n_f16_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x half> 
@llvm.arm.neon.vcvtfxu2fp.v8f16.v8i16(<8 x i16> [[A]], i32 2) +// CHECK-NEXT: ret <8 x half> [[VCVT_N1]] +// float16x8_t test_vcvtq_n_f16_u16(uint16x8_t a) { return vcvtq_n_f16_u16(a, 2); } -// CHECK-LABEL: test_vcvt_n_s16_f16 -// CHECK: [[CVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2fxs.v4i16.v4f16(<4 x half> %vcvt_n, i32 2) -// CHECK: ret <4 x i16> [[CVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvt_n_s16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2fxs.v4i16.v4f16(<4 x half> [[A]], i32 2) +// CHECK-NEXT: ret <4 x i16> [[VCVT_N1]] +// int16x4_t test_vcvt_n_s16_f16(float16x4_t a) { return vcvt_n_s16_f16(a, 2); } -// CHECK-LABEL: test_vcvtq_n_s16_f16 -// CHECK: [[CVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtfp2fxs.v8i16.v8f16(<8 x half> %vcvt_n, i32 2) -// CHECK: ret <8 x i16> [[CVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtq_n_s16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtfp2fxs.v8i16.v8f16(<8 x half> [[A]], i32 2) +// CHECK-NEXT: ret <8 x i16> [[VCVT_N1]] +// int16x8_t test_vcvtq_n_s16_f16(float16x8_t a) { return vcvtq_n_s16_f16(a, 2); } -// CHECK-LABEL: test_vcvt_n_u16_f16 -// CHECK: [[CVT:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2fxu.v4i16.v4f16(<4 x half> %vcvt_n, i32 2) -// CHECK: ret <4 x i16> [[CVT]] +// CHECK-LABEL: define dso_local <4 x i16> @test_vcvt_n_u16_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2fxu.v4i16.v4f16(<4 x half> [[A]], i32 2) +// CHECK-NEXT: ret <4 x i16> [[VCVT_N1]] +// uint16x4_t test_vcvt_n_u16_f16(float16x4_t a) { return vcvt_n_u16_f16(a, 2); } -// CHECK-LABEL: test_vcvtq_n_u16_f16 -// CHECK: [[CVT:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtfp2fxu.v8i16.v8f16(<8 x half> %vcvt_n, i32 2) -// CHECK: ret <8 x i16> [[CVT]] +// CHECK-LABEL: define dso_local <8 x i16> @test_vcvtq_n_u16_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vcvtfp2fxu.v8i16.v8f16(<8 x half> [[A]], i32 2) +// CHECK-NEXT: ret <8 x i16> [[VCVT_N1]] +// uint16x8_t test_vcvtq_n_u16_f16(float16x8_t a) { return vcvtq_n_u16_f16(a, 2); } -// CHECK-LABEL: test_vmax_f16 -// CHECK: [[MAX:%.*]] = call <4 x half> @llvm.arm.neon.vmaxs.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MAX]] +// CHECK-LABEL: define dso_local <4 x half> @test_vmax_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vmaxs.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: ret <4 x half> [[VMAX_V2_I]] +// float16x4_t test_vmax_f16(float16x4_t a, float16x4_t b) { return vmax_f16(a, b); } -// CHECK-LABEL: test_vmaxq_f16 -// CHECK: [[MAX:%.*]] = call <8 x half> @llvm.arm.neon.vmaxs.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MAX]] +// CHECK-LABEL: define dso_local <8 x half> @test_vmaxq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vmaxs.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: ret <8 x half> 
[[VMAXQ_V2_I]] +// float16x8_t test_vmaxq_f16(float16x8_t a, float16x8_t b) { return vmaxq_f16(a, b); } -// CHECK-LABEL: test_vmaxnm_f16 -// CHECK: [[MAX:%.*]] = call <4 x half> @llvm.arm.neon.vmaxnm.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MAX]] +// CHECK-LABEL: define dso_local <4 x half> @test_vmaxnm_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXNM_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vmaxnm.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: ret <4 x half> [[VMAXNM_V2_I]] +// float16x4_t test_vmaxnm_f16(float16x4_t a, float16x4_t b) { return vmaxnm_f16(a, b); } -// CHECK-LABEL: test_vmaxnmq_f16 -// CHECK: [[MAX:%.*]] = call <8 x half> @llvm.arm.neon.vmaxnm.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MAX]] +// CHECK-LABEL: define dso_local <8 x half> @test_vmaxnmq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXNMQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vmaxnm.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: ret <8 x half> [[VMAXNMQ_V2_I]] +// float16x8_t test_vmaxnmq_f16(float16x8_t a, float16x8_t b) { return vmaxnmq_f16(a, b); } -// CHECK-LABEL: test_vmin_f16 -// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vmins.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MIN]] +// CHECK-LABEL: define dso_local <4 x half> @test_vmin_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vmins.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: ret <4 x half> [[VMIN_V2_I]] +// float16x4_t test_vmin_f16(float16x4_t a, float16x4_t b) { return vmin_f16(a, b); } -// CHECK-LABEL: test_vminq_f16 -// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.arm.neon.vmins.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MIN]] +// CHECK-LABEL: define dso_local <8 x half> @test_vminq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vmins.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: ret <8 x half> [[VMINQ_V2_I]] +// float16x8_t test_vminq_f16(float16x8_t a, float16x8_t b) { return vminq_f16(a, b); } -// CHECK-LABEL: test_vminnm_f16 -// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vminnm.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MIN]] +// CHECK-LABEL: define dso_local <4 x half> @test_vminnm_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINNM_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vminnm.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: ret <4 x half> [[VMINNM_V2_I]] +// float16x4_t test_vminnm_f16(float16x4_t a, float16x4_t b) { return vminnm_f16(a, b); } -// CHECK-LABEL: test_vminnmq_f16 -// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.arm.neon.vminnm.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MIN]] +// CHECK-LABEL: define dso_local <8 x half> @test_vminnmq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINNMQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vminnm.v8f16(<8 x half> 
[[A]], <8 x half> [[B]]) +// CHECK-NEXT: ret <8 x half> [[VMINNMQ_V2_I]] +// float16x8_t test_vminnmq_f16(float16x8_t a, float16x8_t b) { return vminnmq_f16(a, b); } -// CHECK-LABEL: test_vmul_f16 -// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, %b -// CHECK: ret <4 x half> [[MUL]] +// CHECK-LABEL: define dso_local <4 x half> @test_vmul_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x half> [[A]], [[B]] +// CHECK-NEXT: ret <4 x half> [[MUL_I]] +// float16x4_t test_vmul_f16(float16x4_t a, float16x4_t b) { return vmul_f16(a, b); } -// CHECK-LABEL: test_vmulq_f16 -// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, %b -// CHECK: ret <8 x half> [[MUL]] +// CHECK-LABEL: define dso_local <8 x half> @test_vmulq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <8 x half> [[A]], [[B]] +// CHECK-NEXT: ret <8 x half> [[MUL_I]] +// float16x8_t test_vmulq_f16(float16x8_t a, float16x8_t b) { return vmulq_f16(a, b); } -// CHECK-LABEL: test_vpadd_f16 -// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.arm.neon.vpadd.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[ADD]] +// CHECK-LABEL: define dso_local <4 x half> @test_vpadd_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vpadd.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: ret <4 x half> [[VPADD_V2_I]] +// float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) { return vpadd_f16(a, b); } -// CHECK-LABEL: test_vpmax_f16 -// CHECK: [[MAX:%.*]] = call <4 x half> @llvm.arm.neon.vpmaxs.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MAX]] +// CHECK-LABEL: define dso_local <4 x half> @test_vpmax_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vpmaxs.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: ret <4 x half> [[VPMAX_V2_I]] +// float16x4_t test_vpmax_f16(float16x4_t a, float16x4_t b) { return vpmax_f16(a, b); } -// CHECK-LABEL: test_vpmin_f16 -// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vpmins.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MIN]] +// CHECK-LABEL: define dso_local <4 x half> @test_vpmin_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vpmins.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: ret <4 x half> [[VPMIN_V2_I]] +// float16x4_t test_vpmin_f16(float16x4_t a, float16x4_t b) { return vpmin_f16(a, b); } -// CHECK-LABEL: test_vrecps_f16 -// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vrecps.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MIN]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrecps_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vrecps.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: ret <4 x half> [[VRECPS_V2_I]] +// float16x4_t test_vrecps_f16(float16x4_t a, float16x4_t b) { return vrecps_f16(a, b); } -// CHECK-LABEL: 
test_vrecpsq_f16 -// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.arm.neon.vrecps.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MIN]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrecpsq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vrecps.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: ret <8 x half> [[VRECPSQ_V2_I]] +// float16x8_t test_vrecpsq_f16(float16x8_t a, float16x8_t b) { return vrecpsq_f16(a, b); } -// CHECK-LABEL: test_vrsqrts_f16 -// CHECK: [[MIN:%.*]] = call <4 x half> @llvm.arm.neon.vrsqrts.v4f16(<4 x half> %a, <4 x half> %b) -// CHECK: ret <4 x half> [[MIN]] +// CHECK-LABEL: define dso_local <4 x half> @test_vrsqrts_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <4 x half> @llvm.arm.neon.vrsqrts.v4f16(<4 x half> [[A]], <4 x half> [[B]]) +// CHECK-NEXT: ret <4 x half> [[VRSQRTS_V2_I]] +// float16x4_t test_vrsqrts_f16(float16x4_t a, float16x4_t b) { return vrsqrts_f16(a, b); } -// CHECK-LABEL: test_vrsqrtsq_f16 -// CHECK: [[MIN:%.*]] = call <8 x half> @llvm.arm.neon.vrsqrts.v8f16(<8 x half> %a, <8 x half> %b) -// CHECK: ret <8 x half> [[MIN]] +// CHECK-LABEL: define dso_local <8 x half> @test_vrsqrtsq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <8 x half> @llvm.arm.neon.vrsqrts.v8f16(<8 x half> [[A]], <8 x half> [[B]]) +// CHECK-NEXT: ret <8 x half> [[VRSQRTSQ_V2_I]] +// float16x8_t test_vrsqrtsq_f16(float16x8_t a, float16x8_t b) { return vrsqrtsq_f16(a, b); } -// CHECK-LABEL: test_vsub_f16 -// CHECK: [[ADD:%.*]] = fsub <4 x half> %a, %b -// CHECK: ret <4 x half> [[ADD]] +// CHECK-LABEL: define dso_local <4 x half> @test_vsub_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x half> [[A]], [[B]] +// CHECK-NEXT: ret <4 x half> [[SUB_I]] +// float16x4_t test_vsub_f16(float16x4_t a, float16x4_t b) { return vsub_f16(a, b); } -// CHECK-LABEL: test_vsubq_f16 -// CHECK: [[ADD:%.*]] = fsub <8 x half> %a, %b -// CHECK: ret <8 x half> [[ADD]] +// CHECK-LABEL: define dso_local <8 x half> @test_vsubq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SUB_I:%.*]] = fsub <8 x half> [[A]], [[B]] +// CHECK-NEXT: ret <8 x half> [[SUB_I]] +// float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b) { return vsubq_f16(a, b); } -// CHECK-LABEL: test_vfma_f16 -// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a) -// CHECK: ret <4 x half> [[ADD]] +// CHECK-LABEL: define dso_local <4 x half> @test_vfma_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[C]], <4 x half> [[A]]) +// CHECK-NEXT: ret <4 x half> [[TMP0]] +// float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) { return vfma_f16(a, b, c); } -// CHECK-LABEL: test_vfmaq_f16 -// CHECK: [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x 
half> %a) -// CHECK: ret <8 x half> [[ADD]] +// CHECK-LABEL: define dso_local <8 x half> @test_vfmaq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[C]], <8 x half> [[A]]) +// CHECK-NEXT: ret <8 x half> [[TMP0]] +// float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmaq_f16(a, b, c); } -// CHECK-LABEL: test_vfms_f16 -// CHECK: [[SUB:%.*]] = fneg <4 x half> %b -// CHECK: [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[SUB]], <4 x half> %c, <4 x half> %a) -// CHECK: ret <4 x half> [[ADD]] +// CHECK-LABEL: define dso_local <4 x half> @test_vfms_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x half> [[B]] +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[C]], <4 x half> [[A]]) +// CHECK-NEXT: ret <4 x half> [[TMP0]] +// float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) { return vfms_f16(a, b, c); } -// CHECK-LABEL: test_vfmsq_f16 -// CHECK: [[SUB:%.*]] = fneg <8 x half> %b -// CHECK: [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[SUB]], <8 x half> %c, <8 x half> %a) -// CHECK: ret <8 x half> [[ADD]] +// CHECK-LABEL: define dso_local <8 x half> @test_vfmsq_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <8 x half> [[B]] +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[C]], <8 x half> [[A]]) +// CHECK-NEXT: ret <8 x half> [[TMP0]] +// float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { return vfmsq_f16(a, b, c); } -// CHECK-LABEL: test_vmul_lane_f16 -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]] -// CHECK: ret <4 x half> [[MUL]] +// CHECK-LABEL: define dso_local <4 x half> @test_vmul_lane_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x half> [[A]], [[LANE]] +// CHECK-NEXT: ret <4 x half> [[MUL]] +// float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) { return vmul_lane_f16(a, b, 3); } -// CHECK-LABEL: test_vmulq_lane_f16 -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> -// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> -// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]] -// CHECK: ret <8 x half> [[MUL]] +// CHECK-LABEL: define dso_local <8 x half> @test_vmulq_lane_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK-NEXT: [[MUL:%.*]] = fmul <8 x half> [[A]], [[LANE]] +// 
CHECK-NEXT: ret <8 x half> [[MUL]] +// float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) { return vmulq_lane_f16(a, b, 3); } -// CHECK-LABEL: test_vmul_n_f16 -// CHECK: [[TMP0:%.*]] = insertelement <4 x half> poison, half [[b:%.*]], i32 0 -// CHECK: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[b]], i32 1 -// CHECK: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half [[b]], i32 2 -// CHECK: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half [[b]], i32 3 -// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP3]] -// CHECK: ret <4 x half> [[MUL]] +// CHECK-LABEL: define dso_local <4 x half> @test_vmul_n_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[B]], i64 0 +// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x half> [[A]], [[VECINIT3]] +// CHECK-NEXT: ret <4 x half> [[MUL]] +// float16x4_t test_vmul_n_f16(float16x4_t a, float16_t b) { return vmul_n_f16(a, b); } -// CHECK-LABEL: test_vmulq_n_f16 -// CHECK: [[TMP0:%.*]] = insertelement <8 x half> poison, half [[b:%.*]], i32 0 -// CHECK: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half [[b]], i32 1 -// CHECK: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[b]], i32 2 -// CHECK: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[b]], i32 3 -// CHECK: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[b]], i32 4 -// CHECK: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[b]], i32 5 -// CHECK: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[b]], i32 6 -// CHECK: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[b]], i32 7 -// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP7]] -// CHECK: ret <8 x half> [[MUL]] +// CHECK-LABEL: define dso_local <8 x half> @test_vmulq_n_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[B]], i64 0 +// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <8 x half> [[A]], [[VECINIT7]] +// CHECK-NEXT: ret <8 x half> [[MUL]] +// float16x8_t test_vmulq_n_f16(float16x8_t a, float16_t b) { return vmulq_n_f16(a, b); } diff --git a/clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c index 947f42cdd0de9..baea770e2c3ca 100644 --- a/clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c +++ b/clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c @@ -1,87 +1,101 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +fullfp16 -target-feature +i8mm \ // RUN: -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg,sroa \ +// RUN: | opt -S -passes=mem2reg,sroa,instcombine \ // RUN: | FileCheck %s // REQUIRES: arm-registered-target #include <arm_neon.h> -// CHECK-LABEL: test_vmmlaq_s32 -// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) -// CHECK: ret <4 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmmlaq_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] 
{ +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMMLA1_I:%.*]] = call <4 x i32> @llvm.arm.neon.smmla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMMLA1_I]] +// int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) { return vmmlaq_s32(r, a, b); } -// CHECK-LABEL: test_vmmlaq_u32 -// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) -// CHECK: ret <4 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vmmlaq_u32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMMLA1_I:%.*]] = call <4 x i32> @llvm.arm.neon.ummla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMMLA1_I]] +// uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) { return vmmlaq_u32(r, a, b); } -// CHECK-LABEL: test_vusmmlaq_s32 -// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) -// CHECK: ret <4 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vusmmlaq_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUSMMLA1_I:%.*]] = call <4 x i32> @llvm.arm.neon.usmmla.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VUSMMLA1_I]] +// int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { return vusmmlaq_s32(r, a, b); } -// CHECK-LABEL: test_vusdot_s32 -// CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) -// CHECK: ret <2 x i32> [[VAL]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_s32( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> [[R]], <8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]] +// int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { return vusdot_s32(r, a, b); } -// CHECK-LABEL: test_vusdot_lane_s32 -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> %r to <8 x i8> -// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP3]]) -// CHECK: ret <2 x i32> [[OP]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vusdot_lane_s32( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> [[R]], <8 x i8> [[A]], <8 x i8> [[TMP1]]) +// CHECK-NEXT: ret <2 x i32> 
[[VUSDOT1_I]] +// int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { return vusdot_lane_s32(r, a, b, 0); } -// CHECK-LABEL: test_vsudot_lane_s32 -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> %r to <8 x i8> -// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP3]], <8 x i8> %a) -// CHECK: ret <2 x i32> [[OP]] +// CHECK-LABEL: define dso_local <2 x i32> @test_vsudot_lane_s32( +// CHECK-SAME: <2 x i32> noundef [[R:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> [[R]], <8 x i8> [[TMP1]], <8 x i8> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VUSDOT1_I]] +// int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) { return vsudot_lane_s32(r, a, b, 0); } -// CHECK-LABEL: test_vusdotq_lane_s32 -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> -// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]]) -// CHECK: ret <4 x i32> [[OP]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vusdotq_lane_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[A]], <16 x i8> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]] +// int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) { return vusdotq_lane_s32(r, a, b, 0); } -// CHECK-LABEL: test_vsudotq_lane_s32 -// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> %r to <16 x i8> -// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %3, <16 x i8> %a) -// CHECK: ret <4 x i32> [[OP]] +// CHECK-LABEL: define dso_local <4 x i32> @test_vsudotq_lane_s32( +// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <8 x i8> noundef 
[[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[B]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VUSDOT1_I:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> [[R]], <16 x i8> [[TMP1]], <16 x i8> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VUSDOT1_I]] +// int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) { return vsudotq_lane_s32(r, a, b, 0); } diff --git a/clang/test/CodeGen/arm64_vdupq_n_f64.c b/clang/test/CodeGen/arm64_vdupq_n_f64.c index 2da2d3bc8d075..202350a805ece 100644 --- a/clang/test/CodeGen/arm64_vdupq_n_f64.c +++ b/clang/test/CodeGen/arm64_vdupq_n_f64.c @@ -1,57 +1,66 @@ -// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -o - -disable-O0-optnone -emit-llvm %s | opt -S -passes=mem2reg | FileCheck %s +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -o - -disable-O0-optnone -emit-llvm %s | opt -S -passes=mem2reg,instcombine | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target #include <arm_neon.h> // vdupq_n_f64 -> dup.2d v0, v0[0] +// CHECK-LABEL: define <2 x double> @test_vdupq_n_f64( +// CHECK-SAME: double noundef [[W:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[W]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x double> [[VECINIT_I]], <2 x double> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x double> [[VECINIT1_I]] // -// CHECK-LABEL: define{{.*}} <2 x double> @test_vdupq_n_f64(double noundef %w) #0 { -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double %w, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %w, i32 1 -// CHECK: ret <2 x double> [[VECINIT1_I]] float64x2_t test_vdupq_n_f64(float64_t w) { return vdupq_n_f64(w); } // might as well test this while we're here // vdupq_n_f32 -> dup.4s v0, v0[0] -// CHECK-LABEL: define{{.*}} <4 x float> @test_vdupq_n_f32(float noundef %w) #0 { -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %w, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %w, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %w, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %w, i32 3 -// CHECK: ret <4 x float> [[VECINIT3_I]] +// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32( +// CHECK-SAME: float noundef [[W:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[W]], i64 0 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x float> [[VECINIT3_I]] +// float32x4_t test_vdupq_n_f32(float32_t w) { return vdupq_n_f32(w); } // vdupq_lane_f64 -> dup.2d v0, v0[0] -// CHECK-LABEL: define{{.*}} <2 x double> @test_vdupq_lane_f64(<1 x double> noundef %V) #0 { -// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %V to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x 
i32> zeroinitializer -// CHECK: ret <2 x double> [[SHUFFLE]] +// CHECK-LABEL: define <2 x double> @test_vdupq_lane_f64( +// CHECK-SAME: <1 x double> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[V]], <1 x double> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x double> [[LANE]] +// float64x2_t test_vdupq_lane_f64(float64x1_t V) { return vdupq_lane_f64(V, 0); } // vmovq_n_f64 -> dup Vd.2d,X0 -// CHECK-LABEL: define{{.*}} <2 x double> @test_vmovq_n_f64(double noundef %w) #0 { -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double %w, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %w, i32 1 -// CHECK: ret <2 x double> [[VECINIT1_I]] +// CHECK-LABEL: define <2 x double> @test_vmovq_n_f64( +// CHECK-SAME: double noundef [[W:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[W]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x double> [[VECINIT_I]], <2 x double> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x double> [[VECINIT1_I]] +// float64x2_t test_vmovq_n_f64(float64_t w) { return vmovq_n_f64(w); } -// CHECK-LABEL: define{{.*}} <4 x half> @test_vmov_n_f16(ptr noundef %a1) #0 { -// CHECK: [[TMP0:%.*]] = load half, ptr %a1, align 2 -// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0 -// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1 -// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2 -// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3 -// CHECK: ret <4 x half> [[VECINIT3]] +// CHECK-LABEL: define <4 x half> @test_vmov_n_f16( +// CHECK-SAME: ptr noundef [[A1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A1]], align 2 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i64 0 +// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x half> [[VECINIT3]] +// float16x4_t test_vmov_n_f16(float16_t *a1) { return vmov_n_f16(*a1); } @@ -62,17 +71,14 @@ float64x1_t test_vmov_n_f64(float64_t a1) { } */ -// CHECK-LABEL: define{{.*}} <8 x half> @test_vmovq_n_f16(ptr noundef %a1) #0 { -// CHECK: [[TMP0:%.*]] = load half, ptr %a1, align 2 -// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0 -// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1 -// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2 -// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3 -// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4 -// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5 -// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6 -// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7 -// CHECK: ret <8 x half> [[VECINIT7]] +// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16( +// CHECK-SAME: ptr noundef [[A1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A1]], align 2 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i64 0 +// 
CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x half> [[VECINIT7]] +// float16x8_t test_vmovq_n_f16(float16_t *a1) { return vmovq_n_f16(*a1); } diff --git a/clang/test/CodeGen/arm_neon_intrinsics.c b/clang/test/CodeGen/arm_neon_intrinsics.c index 9f43dd2be5af5..6177e01887c46 100644 --- a/clang/test/CodeGen/arm_neon_intrinsics.c +++ b/clang/test/CodeGen/arm_neon_intrinsics.c @@ -1,20011 +1,23268 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\ // RUN: -target-cpu swift \ // RUN: -target-feature +fullfp16 -ffreestanding \ // RUN: -flax-vector-conversions=none \ // RUN: -disable-O0-optnone -emit-llvm -o - %s \ -// RUN: | opt -S -passes=mem2reg | FileCheck %s +// RUN: | opt -S -passes=mem2reg,sroa,instcombine | FileCheck %s // REQUIRES: aarch64-registered-target || arm-registered-target #include <arm_neon.h> -// CHECK-LABEL: @test_vaba_s8( -// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]] -// CHECK: ret <8 x i8> [[ADD_I]] +// CHECK-LABEL: define <8 x i8> @test_vaba_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[VABD_V_I]] +// CHECK-NEXT: ret <8 x i8> [[ADD_I]] +// int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vaba_s8(a, b, c); } -// CHECK-LABEL: @test_vaba_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD_V2_I_I]] -// CHECK: ret <4 x i16> [[ADD_I]] +// CHECK-LABEL: define <4 x i16> @test_vaba_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[B]], <4 x i16> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[VABD_V2_I]] +// CHECK-NEXT: ret <4 x i16> [[ADD_I]] +// int16x4_t test_vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vaba_s16(a, b, c); } -// CHECK-LABEL: @test_vaba_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD_V2_I_I]] -// CHECK: ret <2 x i32> [[ADD_I]] +// CHECK-LABEL: define <2 x i32> @test_vaba_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[B]], <2 x i32> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[VABD_V2_I]] +// CHECK-NEXT: ret <2 x i32> [[ADD_I]] +// int32x2_t 
test_vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vaba_s32(a, b, c); } -// CHECK-LABEL: @test_vaba_u8( -// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]] -// CHECK: ret <8 x i8> [[ADD_I]] +// CHECK-LABEL: define <8 x i8> @test_vaba_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[VABD_V_I]] +// CHECK-NEXT: ret <8 x i8> [[ADD_I]] +// uint8x8_t test_vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vaba_u8(a, b, c); } -// CHECK-LABEL: @test_vaba_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD_V2_I_I]] -// CHECK: ret <4 x i16> [[ADD_I]] +// CHECK-LABEL: define <4 x i16> @test_vaba_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[B]], <4 x i16> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[VABD_V2_I]] +// CHECK-NEXT: ret <4 x i16> [[ADD_I]] +// uint16x4_t test_vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vaba_u16(a, b, c); } -// CHECK-LABEL: @test_vaba_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD_V2_I_I]] -// CHECK: ret <2 x i32> [[ADD_I]] +// CHECK-LABEL: define <2 x i32> @test_vaba_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[B]], <2 x i32> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[VABD_V2_I]] +// CHECK-NEXT: ret <2 x i32> [[ADD_I]] +// uint32x2_t test_vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vaba_u32(a, b, c); } -// CHECK-LABEL: @test_vabaq_s8( -// CHECK: [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %b, <16 x i8> %c) -// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]] -// CHECK: ret <16 x i8> [[ADD_I]] +// CHECK-LABEL: define <16 x i8> @test_vabaq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> [[B]], <16 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[VABDQ_V_I]] +// CHECK-NEXT: ret <16 x i8> [[ADD_I]] +// int8x16_t test_vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return vabaq_s8(a, b, c); } -// CHECK-LABEL: @test_vabaq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] 
= bitcast <8 x i16> %c to <16 x i8> -// CHECK: [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %b, <8 x i16> %c) -// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VABDQ_V2_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vabaq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[B]], <8 x i16> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VABDQ_V2_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// int16x8_t test_vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return vabaq_s16(a, b, c); } -// CHECK-LABEL: @test_vabaq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8> -// CHECK: [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %b, <4 x i32> %c) -// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VABDQ_V2_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vabaq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[B]], <4 x i32> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VABDQ_V2_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return vabaq_s32(a, b, c); } -// CHECK-LABEL: @test_vabaq_u8( -// CHECK: [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %b, <16 x i8> %c) -// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]] -// CHECK: ret <16 x i8> [[ADD_I]] +// CHECK-LABEL: define <16 x i8> @test_vabaq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> [[B]], <16 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[VABDQ_V_I]] +// CHECK-NEXT: ret <16 x i8> [[ADD_I]] +// uint8x16_t test_vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vabaq_u8(a, b, c); } -// CHECK-LABEL: @test_vabaq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8> -// CHECK: [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %b, <8 x i16> %c) -// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VABDQ_V2_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vabaq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[B]], <8 x i16> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VABDQ_V2_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// uint16x8_t test_vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vabaq_u16(a, b, c); } -// CHECK-LABEL: @test_vabaq_u32( -// CHECK: [[TMP0:%.*]] 
= bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8> -// CHECK: [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %b, <4 x i32> %c) -// CHECK: [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VABDQ_V2_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vabaq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[B]], <4 x i32> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VABDQ_V2_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// uint32x4_t test_vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vabaq_u32(a, b, c); } -// CHECK-LABEL: @test_vabal_s8( -// CHECK: [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vabal_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16> +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vabal_s8(a, b, c); } -// CHECK-LABEL: @test_vabal_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I_I]] to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vabal_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[B]], <4 x i16> [[C]]) +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32> +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vabal_s16(a, b, c); } -// CHECK-LABEL: @test_vabal_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I_I]] to <2 x i64> -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]] -// CHECK: ret <2 x i64> 
[[ADD_I]] +// CHECK-LABEL: define <2 x i64> @test_vabal_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[B]], <2 x i32> [[C]]) +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64> +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vabal_s32(a, b, c); } -// CHECK-LABEL: @test_vabal_u8( -// CHECK: [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16> -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vabal_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16> +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vabal_u8(a, b, c); } -// CHECK-LABEL: @test_vabal_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I_I]] to <4 x i32> -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vabal_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[B]], <4 x i16> [[C]]) +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32> +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vabal_u16(a, b, c); } -// CHECK-LABEL: @test_vabal_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I_I]] to <2 x i64> -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define <2 x i64> @test_vabal_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> 
[[B]], <2 x i32> [[C]]) +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64> +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vabal_u32(a, b, c); } -// CHECK-LABEL: @test_vabd_s8( -// CHECK: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VABD_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vabd_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VABD_V_I]] +// int8x8_t test_vabd_s8(int8x8_t a, int8x8_t b) { return vabd_s8(a, b); } -// CHECK-LABEL: @test_vabd_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VABD_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vabd_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VABD_V2_I]] +// int16x4_t test_vabd_s16(int16x4_t a, int16x4_t b) { return vabd_s16(a, b); } -// CHECK-LABEL: @test_vabd_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VABD_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vabd_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VABD_V2_I]] +// int32x2_t test_vabd_s32(int32x2_t a, int32x2_t b) { return vabd_s32(a, b); } -// CHECK-LABEL: @test_vabd_u8( -// CHECK: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VABD_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vabd_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VABD_V_I]] +// uint8x8_t test_vabd_u8(uint8x8_t a, uint8x8_t b) { return vabd_u8(a, b); } -// CHECK-LABEL: @test_vabd_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VABD_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vabd_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <4 x i16> 
@llvm.arm.neon.vabdu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VABD_V2_I]] +// uint16x4_t test_vabd_u16(uint16x4_t a, uint16x4_t b) { return vabd_u16(a, b); } -// CHECK-LABEL: @test_vabd_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VABD_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vabd_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VABD_V2_I]] +// uint32x2_t test_vabd_u32(uint32x2_t a, uint32x2_t b) { return vabd_u32(a, b); } -// CHECK-LABEL: @test_vabd_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VABD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: [[VABD_V3_I:%.*]] = bitcast <2 x float> [[VABD_V2_I]] to <8 x i8> -// CHECK: ret <2 x float> [[VABD_V2_I]] +// CHECK-LABEL: define <2 x float> @test_vabd_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VABD_V2_I]] +// float32x2_t test_vabd_f32(float32x2_t a, float32x2_t b) { return vabd_f32(a, b); } -// CHECK-LABEL: @test_vabdq_s8( -// CHECK: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VABDQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vabdq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VABDQ_V_I]] +// int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) { return vabdq_s8(a, b); } -// CHECK-LABEL: @test_vabdq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VABDQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vabdq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VABDQ_V2_I]] +// int16x8_t test_vabdq_s16(int16x8_t a, int16x8_t b) { return vabdq_s16(a, b); } -// CHECK-LABEL: @test_vabdq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VABDQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vabdq_s32( +// 
CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VABDQ_V2_I]] +// int32x4_t test_vabdq_s32(int32x4_t a, int32x4_t b) { return vabdq_s32(a, b); } -// CHECK-LABEL: @test_vabdq_u8( -// CHECK: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VABDQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vabdq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VABDQ_V_I]] +// uint8x16_t test_vabdq_u8(uint8x16_t a, uint8x16_t b) { return vabdq_u8(a, b); } -// CHECK-LABEL: @test_vabdq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VABDQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vabdq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VABDQ_V2_I]] +// uint16x8_t test_vabdq_u16(uint16x8_t a, uint16x8_t b) { return vabdq_u16(a, b); } -// CHECK-LABEL: @test_vabdq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VABDQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vabdq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VABDQ_V2_I]] +// uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) { return vabdq_u32(a, b); } -// CHECK-LABEL: @test_vabdq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VABDQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: [[VABDQ_V3_I:%.*]] = bitcast <4 x float> [[VABDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VABDQ_V2_I]] +// CHECK-LABEL: define <4 x float> @test_vabdq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABDQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> [[A]], <4 x float> [[B]]) +// CHECK-NEXT: ret <4 x float> [[VABDQ_V2_I]] +// float32x4_t test_vabdq_f32(float32x4_t a, float32x4_t b) { return vabdq_f32(a, b); } -// CHECK-LABEL: @test_vabdl_s8( -// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16> -// CHECK: ret <8 x i16> 
[[VMOVL_I_I]] +// CHECK-LABEL: define <8 x i16> @test_vabdl_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]] +// int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) { return vabdl_s8(a, b); } -// CHECK-LABEL: @test_vabdl_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32> -// CHECK: ret <4 x i32> [[VMOVL_I_I]] +// CHECK-LABEL: define <4 x i32> @test_vabdl_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]] +// int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) { return vabdl_s16(a, b); } -// CHECK-LABEL: @test_vabdl_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8> -// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64> -// CHECK: ret <2 x i64> [[VMOVL_I_I]] +// CHECK-LABEL: define <2 x i64> @test_vabdl_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]] +// int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) { return vabdl_s32(a, b); } -// CHECK-LABEL: @test_vabdl_u8( -// CHECK: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16> -// CHECK: ret <8 x i16> [[VMOVL_I_I]] +// CHECK-LABEL: define <8 x i16> @test_vabdl_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]] +// uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) { return vabdl_u8(a, b); } -// CHECK-LABEL: @test_vabdl_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8> -// 
CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I_I]]
+// CHECK-LABEL: define <4 x i32> @test_vabdl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[VABD_V2_I_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]]
+//
uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
  return vabdl_u16(a, b);
}

-// CHECK-LABEL: @test_vabdl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I_I]]
+// CHECK-LABEL: define <2 x i64> @test_vabdl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[VABD_V2_I_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]]
+//
uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
  return vabdl_u32(a, b);
}

-// CHECK-LABEL: @test_vabs_s8(
-// CHECK: [[VABS_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VABS_I]]
+// CHECK-LABEL: define <8 x i8> @test_vabs_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VABS_I]]
+//
int8x8_t test_vabs_s8(int8x8_t a) {
  return vabs_s8(a);
}

-// CHECK-LABEL: @test_vabs_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> %a)
-// CHECK: ret <4 x i16> [[VABS1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vabs_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VABS1_I]]
+//
int16x4_t test_vabs_s16(int16x4_t a) {
  return vabs_s16(a);
}

-// CHECK-LABEL: @test_vabs_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> %a)
-// CHECK: ret <2 x i32> [[VABS1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vabs_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VABS1_I]]
+//
int32x2_t test_vabs_s32(int32x2_t a) {
  return vabs_s32(a);
}

-// CHECK-LABEL: @test_vabs_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VABS1_I]]
+// CHECK-LABEL: define <2 x float> @test_vabs_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VABS1_I]]
+//
float32x2_t test_vabs_f32(float32x2_t a) {
  return vabs_f32(a);
}

-// CHECK-LABEL: @test_vabsq_s8(
-// CHECK: [[VABS_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VABS_I]]
+// CHECK-LABEL: define <16 x i8> @test_vabsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VABS_I]]
+//
int8x16_t test_vabsq_s8(int8x16_t a) {
  return vabsq_s8(a);
}

-// CHECK-LABEL: @test_vabsq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %a)
-// CHECK: ret <8 x i16> [[VABS1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vabsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VABS1_I]]
+//
int16x8_t test_vabsq_s16(int16x8_t a) {
  return vabsq_s16(a);
}

-// CHECK-LABEL: @test_vabsq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> %a)
-// CHECK: ret <4 x i32> [[VABS1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vabsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VABS1_I]]
+//
int32x4_t test_vabsq_s32(int32x4_t a) {
  return vabsq_s32(a);
}

-// CHECK-LABEL: @test_vabsq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VABS1_I]]
+// CHECK-LABEL: define <4 x float> @test_vabsq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[VABS1_I]]
+//
float32x4_t test_vabsq_f32(float32x4_t a) {
  return vabsq_f32(a);
}

-// CHECK-LABEL: @test_vadd_s8(
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define <8 x i8> @test_vadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
int8x8_t test_vadd_s8(int8x8_t a, int8x8_t b) {
  return vadd_s8(a, b);
}

-// CHECK-LABEL: @test_vadd_s16(
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define <4 x i16> @test_vadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
int16x4_t test_vadd_s16(int16x4_t a, int16x4_t b) {
  return vadd_s16(a, b);
}

-// CHECK-LABEL: @test_vadd_s32(
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define <2 x i32> @test_vadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
int32x2_t test_vadd_s32(int32x2_t a, int32x2_t b) {
  return vadd_s32(a, b);
}

-// CHECK-LABEL: @test_vadd_s64(
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vadd_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
int64x1_t test_vadd_s64(int64x1_t a, int64x1_t b) {
  return vadd_s64(a, b);
}

-// CHECK-LABEL: @test_vadd_f32(
-// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, %b
-// CHECK: ret <2 x float> [[ADD_I]]
+// CHECK-LABEL: define <2 x float> @test_vadd_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x float> [[ADD_I]]
+//
float32x2_t test_vadd_f32(float32x2_t a, float32x2_t b) {
  return vadd_f32(a, b);
}

-// CHECK-LABEL: @test_vadd_u8(
-// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[ADD_I]]
+// CHECK-LABEL: define <8 x i8> @test_vadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[ADD_I]]
+//
uint8x8_t test_vadd_u8(uint8x8_t a, uint8x8_t b) {
  return vadd_u8(a, b);
}

-// CHECK-LABEL: @test_vadd_u16(
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define <4 x i16> @test_vadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
uint16x4_t test_vadd_u16(uint16x4_t a, uint16x4_t b) {
  return vadd_u16(a, b);
}

-// CHECK-LABEL: @test_vadd_u32(
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define <2 x i32> @test_vadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
uint32x2_t test_vadd_u32(uint32x2_t a, uint32x2_t b) {
  return vadd_u32(a, b);
}

-// CHECK-LABEL: @test_vadd_u64(
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vadd_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
uint64x1_t test_vadd_u64(uint64x1_t a, uint64x1_t b) {
  return vadd_u64(a, b);
}

-// CHECK-LABEL: @test_vaddq_s8(
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define <16 x i8> @test_vaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
int8x16_t test_vaddq_s8(int8x16_t a, int8x16_t b) {
  return vaddq_s8(a, b);
}

-// CHECK-LABEL: @test_vaddq_s16(
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vaddq_s16(int16x8_t a, int16x8_t b) {
  return vaddq_s16(a, b);
}

-// CHECK-LABEL: @test_vaddq_s32(
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vaddq_s32(int32x4_t a, int32x4_t b) {
  return vaddq_s32(a, b);
}

-// CHECK-LABEL: @test_vaddq_s64(
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vaddq_s64(int64x2_t a, int64x2_t b) {
  return vaddq_s64(a, b);
}

-// CHECK-LABEL: @test_vaddq_f32(
-// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, %b
-// CHECK: ret <4 x float> [[ADD_I]]
+// CHECK-LABEL: define <4 x float> @test_vaddq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x float> [[ADD_I]]
+//
float32x4_t test_vaddq_f32(float32x4_t a, float32x4_t b) {
  return vaddq_f32(a, b);
}

-// CHECK-LABEL: @test_vaddq_u8(
-// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[ADD_I]]
+// CHECK-LABEL: define <16 x i8> @test_vaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[ADD_I]]
+//
uint8x16_t test_vaddq_u8(uint8x16_t a, uint8x16_t b) {
  return vaddq_u8(a, b);
}

-// CHECK-LABEL: @test_vaddq_u16(
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vaddq_u16(uint16x8_t a, uint16x8_t b) {
  return vaddq_u16(a, b);
}

-// CHECK-LABEL: @test_vaddq_u32(
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b) {
  return vaddq_u32(a, b);
}

-// CHECK-LABEL: @test_vaddq_u64(
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vaddq_u64(uint64x2_t a, uint64x2_t b) {
  return vaddq_u64(a, b);
}

-// CHECK-LABEL: @test_vaddhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VADDHN2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VADDHN2_I]]
+//
int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
  return vaddhn_s16(a, b);
}

-// CHECK-LABEL: @test_vaddhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VADDHN2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VADDHN2_I]]
+//
int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
  return vaddhn_s32(a, b);
}

-// CHECK-LABEL: @test_vaddhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VADDHN2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VADDHN2_I]]
+//
int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
  return vaddhn_s64(a, b);
}

-// CHECK-LABEL: @test_vaddhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <8 x i16> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VADDHN2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VADDHN2_I]]
+//
uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
  return vaddhn_u16(a, b);
}

-// CHECK-LABEL: @test_vaddhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <4 x i32> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VADDHN2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vaddhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VADDHN2_I]]
+//
uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
  return vaddhn_u32(a, b);
}

-// CHECK-LABEL: @test_vaddhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VADDHN_I:%.*]] = add <2 x i64> %a, %b
-// CHECK: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
-// CHECK: [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VADDHN2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VADDHN_I:%.*]] = add <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VADDHN2_I:%.*]] = trunc nuw <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VADDHN2_I]]
+//
uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
  return vaddhn_u64(a, b);
}

-// CHECK-LABEL: @test_vaddl_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I5:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <8 x i16> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
  return vaddl_s8(a, b);
}

-// CHECK-LABEL: @test_vaddl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I5:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <4 x i32> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
  return vaddl_s16(a, b);
}

-// CHECK-LABEL: @test_vaddl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I5:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nsw <2 x i64> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
  return vaddl_s32(a, b);
}

-// CHECK-LABEL: @test_vaddl_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I5:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <8 x i16> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
  return vaddl_u8(a, b);
}

-// CHECK-LABEL: @test_vaddl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I5:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <4 x i32> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
  return vaddl_u16(a, b);
}

-// CHECK-LABEL: @test_vaddl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I5:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add nuw nsw <2 x i64> [[VMOVL_I5]], [[VMOVL_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
  return vaddl_u32(a, b);
}

-// CHECK-LABEL: @test_vaddw_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddw_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
  return vaddw_s8(a, b);
}

-// CHECK-LABEL: @test_vaddw_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddw_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
  return vaddw_s16(a, b);
}

-// CHECK-LABEL: @test_vaddw_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddw_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) {
  return vaddw_s32(a, b);
}

-// CHECK-LABEL: @test_vaddw_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vaddw_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
  return vaddw_u8(a, b);
}

-// CHECK-LABEL: @test_vaddw_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vaddw_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
  return vaddw_u16(a, b);
}

-// CHECK-LABEL: @test_vaddw_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vaddw_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMOVL_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) {
  return vaddw_u32(a, b);
}

-// CHECK-LABEL: @test_vand_s8(
-// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[AND_I]]
+// CHECK-LABEL: define <8 x i8> @test_vand_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[AND_I]]
+//
int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) {
  return vand_s8(a, b);
}

-// CHECK-LABEL: @test_vand_s16(
-// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[AND_I]]
+// CHECK-LABEL: define <4 x i16> @test_vand_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[AND_I]]
+//
int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) {
  return vand_s16(a, b);
}

-// CHECK-LABEL: @test_vand_s32(
-// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[AND_I]]
+// CHECK-LABEL: define <2 x i32> @test_vand_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[AND_I]]
+//
int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) {
  return vand_s32(a, b);
}

-// CHECK-LABEL: @test_vand_s64(
-// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[AND_I]]
+// CHECK-LABEL: define <1 x i64> @test_vand_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[AND_I]]
+//
int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) {
  return vand_s64(a, b);
}

-// CHECK-LABEL: @test_vand_u8(
-// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[AND_I]]
+// CHECK-LABEL: define <8 x i8> @test_vand_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[AND_I]]
+//
uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) {
  return vand_u8(a, b);
}

-// CHECK-LABEL: @test_vand_u16(
-// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[AND_I]]
+// CHECK-LABEL: define <4 x i16> @test_vand_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[AND_I]]
+//
uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) {
  return vand_u16(a, b);
}

-// CHECK-LABEL: @test_vand_u32(
-// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[AND_I]]
+// CHECK-LABEL: define <2 x i32> @test_vand_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[AND_I]]
+//
uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) {
  return vand_u32(a, b);
}

-// CHECK-LABEL: @test_vand_u64(
-// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[AND_I]]
+// CHECK-LABEL: define <1 x i64> @test_vand_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[AND_I]]
+//
uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) {
  return vand_u64(a, b);
}

-// CHECK-LABEL: @test_vandq_s8(
-// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[AND_I]]
+// CHECK-LABEL: define <16 x i8> @test_vandq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[AND_I]]
+//
int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) {
  return vandq_s8(a, b);
}

-// CHECK-LABEL: @test_vandq_s16(
-// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[AND_I]]
+// CHECK-LABEL: define <8 x i16> @test_vandq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[AND_I]]
+//
int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) {
  return vandq_s16(a, b);
}

-// CHECK-LABEL: @test_vandq_s32(
-// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[AND_I]]
+// CHECK-LABEL: define <4 x i32> @test_vandq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[AND_I]]
+//
int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) {
  return vandq_s32(a, b);
}

-// CHECK-LABEL: @test_vandq_s64(
-// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[AND_I]]
+// CHECK-LABEL: define <2 x i64> @test_vandq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[AND_I]]
+//
int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) {
  return vandq_s64(a, b);
}

-// CHECK-LABEL: @test_vandq_u8(
-// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[AND_I]]
+// CHECK-LABEL: define <16 x i8> @test_vandq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[AND_I]]
+//
uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) {
  return vandq_u8(a, b);
}

-// CHECK-LABEL: @test_vandq_u16(
-// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[AND_I]]
+// CHECK-LABEL: define <8 x i16> @test_vandq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[AND_I]]
+//
uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) {
  return vandq_u16(a, b);
}

-// CHECK-LABEL: @test_vandq_u32(
-// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[AND_I]]
+// CHECK-LABEL: define <4 x i32> @test_vandq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[AND_I]]
+//
uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) {
  return vandq_u32(a, b);
}

-// CHECK-LABEL: @test_vandq_u64(
-// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[AND_I]]
+// CHECK-LABEL: define <2 x i64> @test_vandq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[AND_I]]
+//
uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) {
  return vandq_u64(a, b);
}

-// CHECK-LABEL: @test_vbic_s8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, splat (i8 -1)
-// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
-// CHECK: ret <8 x i8> [[AND_I]]
+// CHECK-LABEL: define <8 x i8> @test_vbic_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i8> [[AND_I]]
+//
int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) {
  return vbic_s8(a, b);
}

-// CHECK-LABEL: @test_vbic_s16(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, splat (i16 -1)
-// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
-// CHECK: ret <4 x i16> [[AND_I]]
+// CHECK-LABEL: define <4 x i16> @test_vbic_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i16> [[AND_I]]
+//
int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) {
  return vbic_s16(a, b);
}

-// CHECK-LABEL: @test_vbic_s32(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, splat (i32 -1)
-// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
-// CHECK: ret <2 x i32> [[AND_I]]
+// CHECK-LABEL: define <2 x i32> @test_vbic_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i32> [[AND_I]]
+//
int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) {
  return vbic_s32(a, b);
}

-// CHECK-LABEL: @test_vbic_s64(
-// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, splat (i64 -1)
-// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
-// CHECK: ret <1 x i64> [[AND_I]]
+// CHECK-LABEL: define <1 x i64> @test_vbic_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <1 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <1 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <1 x i64> [[AND_I]]
+//
int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) {
  return vbic_s64(a, b);
}

-// CHECK-LABEL: @test_vbic_u8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, splat (i8 -1)
-// CHECK: [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
-// CHECK: ret <8 x i8> [[AND_I]]
+// CHECK-LABEL: define <8 x i8> @test_vbic_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i8> [[AND_I]]
+//
uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) {
  return vbic_u8(a, b);
}

-// CHECK-LABEL: @test_vbic_u16(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, splat (i16 -1)
-// CHECK: [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
-// CHECK: ret <4 x i16> [[AND_I]]
+// CHECK-LABEL: define <4 x i16> @test_vbic_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i16> [[AND_I]]
+//
uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) {
  return vbic_u16(a, b);
}

-// CHECK-LABEL: @test_vbic_u32(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, splat (i32 -1)
-// CHECK: [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
-// CHECK: ret <2 x i32> [[AND_I]]
+// CHECK-LABEL: define <2 x i32> @test_vbic_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i32> [[AND_I]]
+//
uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) {
  return vbic_u32(a, b);
}

-// CHECK-LABEL: @test_vbic_u64(
-// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, splat (i64 -1)
-// CHECK: [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
-// CHECK: ret <1 x i64> [[AND_I]]
+// CHECK-LABEL: define <1 x i64> @test_vbic_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <1 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <1 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <1 x i64> [[AND_I]]
+//
uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) {
  return vbic_u64(a, b);
}

-// CHECK-LABEL: @test_vbicq_s8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, splat (i8 -1)
-// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
-// CHECK: ret <16 x i8> [[AND_I]]
+// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <16 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <16 x i8> [[AND_I]]
+//
int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) {
  return vbicq_s8(a, b);
}

-// CHECK-LABEL: @test_vbicq_s16(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, splat (i16 -1)
-// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
-// CHECK: ret <8 x i16> [[AND_I]]
+// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i16> [[AND_I]]
+//
int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) {
  return vbicq_s16(a, b);
}

-// CHECK-LABEL: @test_vbicq_s32(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, splat (i32 -1)
-// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
-// CHECK: ret <4 x i32> [[AND_I]]
+// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i32> [[AND_I]]
+//
int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) {
  return vbicq_s32(a, b);
}

-// CHECK-LABEL: @test_vbicq_s64(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, splat (i64 -1)
-// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
-// CHECK: ret <2 x i64> [[AND_I]]
+// CHECK-LABEL: define <2 x i64> @test_vbicq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i64> [[AND_I]]
+//
int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) {
  return vbicq_s64(a, b);
}

-// CHECK-LABEL: @test_vbicq_u8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, splat (i8 -1)
-// CHECK: [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
-// CHECK: ret <16 x i8> [[AND_I]]
+// CHECK-LABEL: define <16 x i8> @test_vbicq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <16 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <16 x i8> [[AND_I]]
+//
uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) {
  return vbicq_u8(a, b);
}

-// CHECK-LABEL: @test_vbicq_u16(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, splat (i16 -1)
-// CHECK: [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
-// CHECK: ret <8 x i16> [[AND_I]]
+// CHECK-LABEL: define <8 x i16> @test_vbicq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <8 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i16> [[AND_I]]
+//
uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) {
  return vbicq_u16(a, b);
}

-// CHECK-LABEL: @test_vbicq_u32(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, splat (i32 -1)
-// CHECK: [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
-// CHECK: ret <4 x i32> [[AND_I]]
+// CHECK-LABEL: define <4 x i32> @test_vbicq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <4 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i32> [[AND_I]]
+//
uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) {
  return vbicq_u32(a, b);
}

-// CHECK-LABEL: @test_vbicq_u64(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, splat (i64 -1)
-// CHECK: [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
-// CHECK: ret <2 x i64> [[AND_I]]
+// CHECK-LABEL: define <2 x i64> @test_vbicq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[AND_I:%.*]] = and <2 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i64> [[AND_I]]
+//
uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) {
  return vbicq_u64(a, b);
}

-// CHECK-LABEL: @test_vbsl_s8(
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VBSL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vbsl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VBSL_V_I]]
+//
int8x8_t test_vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) {
  return vbsl_s8(a, b, c);
}

-// CHECK-LABEL: @test_vbsl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP3]]
+// CHECK-LABEL: define <4 x i16> @test_vbsl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP3]]
+//
int16x4_t test_vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c) {
  return vbsl_s16(a, b, c);
}

-// CHECK-LABEL: @test_vbsl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP3]]
+// CHECK-LABEL: define <2 x i32> @test_vbsl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP3]]
+//
int32x2_t test_vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c) {
  return vbsl_s32(a, b, c);
}

-// CHECK-LABEL: @test_vbsl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP3]]
+// CHECK-LABEL: define <1 x i64> @test_vbsl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]], <1 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP3]]
+//
int64x1_t test_vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) {
  return vbsl_s64(a, b, c);
}

-// CHECK-LABEL: @test_vbsl_u8(
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VBSL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vbsl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VBSL_V_I]]
+//
uint8x8_t test_vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vbsl_u8(a, b, c);
}

-// CHECK-LABEL: @test_vbsl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP3]]
+// CHECK-LABEL: define <4 x i16> @test_vbsl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP3]]
+//
uint16x4_t test_vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vbsl_u16(a, b, c);
}

-// CHECK-LABEL: @test_vbsl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP3]]
+// CHECK-LABEL: define <2 x i32> @test_vbsl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP3]]
+//
uint32x2_t test_vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vbsl_u32(a, b, c);
}

-// CHECK-LABEL: @test_vbsl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP3]]
+// CHECK-LABEL: define <1 x i64> @test_vbsl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]], <1 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP3]]
+//
uint64x1_t test_vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c) {
  return vbsl_u64(a, b, c);
}

-// CHECK-LABEL: @test_vbsl_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x float>
-// CHECK: ret <2 x float> [[TMP3]]
+// CHECK-LABEL: define <2 x float> @test_vbsl_f32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP3]]
+//
float32x2_t test_vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) {
  return vbsl_f32(a, b, c);
}

-// CHECK-LABEL: @test_vbsl_p8(
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VBSL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vbsl_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VBSL_V_I]]
+//
poly8x8_t test_vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c) {
  return vbsl_p8(a, b, c);
}

-// CHECK-LABEL: @test_vbsl_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP3]]
+// CHECK-LABEL: define <4 x i16> @test_vbsl_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B]] to <8 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[C]] to <8 x i8>
+// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP3]]
+//
poly16x4_t test_vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c) {
  return vbsl_p16(a, b, c);
}

-// CHECK-LABEL: @test_vbslq_s8(
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
-// CHECK: ret <16 x i8> [[VBSLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vbslq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
+// CHECK-NEXT: ret <16 x i8> [[VBSLQ_V_I]]
+//
int8x16_t test_vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) {
  return vbslq_s8(a, b, c);
}

-// CHECK-LABEL: @test_vbslq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP3]]
+// CHECK-LABEL: define <8 x i16> @test_vbslq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP3]]
+//
int16x8_t test_vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) {
  return vbslq_s16(a, b, c);
}

-// CHECK-LABEL: @test_vbslq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP3]]
+// CHECK-LABEL: define <4 x i32> @test_vbslq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP3]]
+//
int32x4_t test_vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) {
  return vbslq_s32(a, b, c);
}

-// CHECK-LABEL: @test_vbslq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP3]]
+// CHECK-LABEL: define <2 x i64> @test_vbslq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]], <2 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP3]]
+//
int64x2_t test_vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) {
  return vbslq_s64(a, b, c);
}

-// CHECK-LABEL: @test_vbslq_u8(
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
-// CHECK: ret <16 x i8> [[VBSLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vbslq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
+// CHECK-NEXT: ret <16 x i8> [[VBSLQ_V_I]]
+//
uint8x16_t test_vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
  return vbslq_u8(a, b, c);
}

-// CHECK-LABEL: @test_vbslq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP3]]
+// CHECK-LABEL: define <8 x i16> @test_vbslq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP3]]
+//
uint16x8_t test_vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
  return vbslq_u16(a, b, c);
}

-// CHECK-LABEL: @test_vbslq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP3]]
+// CHECK-LABEL: define <4 x i32> @test_vbslq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP3]]
+//
uint32x4_t test_vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
  return vbslq_u32(a, b, c);
}

-// CHECK-LABEL: @test_vbslq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP3]]
+// CHECK-LABEL: define <2 x i64> @test_vbslq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]], <2 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP3]]
+//
uint64x2_t test_vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
  return vbslq_u64(a, b, c);
}

-// CHECK-LABEL: @test_vbslq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x float>
-// CHECK: ret <4 x float> [[TMP3]]
+// CHECK-LABEL: define <4 x float> @test_vbslq_f32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP3]]
+//
float32x4_t test_vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) {
  return vbslq_f32(a, b, c);
}

-// CHECK-LABEL: @test_vbslq_p8(
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
-// CHECK: ret <16 x i8> [[VBSLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vbslq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
+// CHECK-NEXT: ret <16 x i8> [[VBSLQ_V_I]]
+//
poly8x16_t test_vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c) {
  return vbslq_p8(a, b, c);
}

-// CHECK-LABEL: @test_vbslq_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
-// CHECK: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP3]]
+// CHECK-LABEL: define <8 x i16> @test_vbslq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[B]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[C]] to <16 x i8>
+// CHECK-NEXT: [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP3]]
+//
poly16x8_t test_vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c) {
  return vbslq_p16(a, b, c);
}

-// CHECK-LABEL: @test_vcage_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x i32> [[VCAGE_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcage_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VCAGE_V2_I]]
+//
uint32x2_t test_vcage_f32(float32x2_t a, float32x2_t b) {
  return vcage_f32(a, b);
}

-// CHECK-LABEL: @test_vcageq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x i32> [[VCAGEQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcageq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VCAGEQ_V2_I]]
+//
uint32x4_t test_vcageq_f32(float32x4_t a, float32x4_t b) {
  return vcageq_f32(a, b);
}

-// CHECK-LABEL: @test_vcagt_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: ret <2 x i32> [[VCAGT_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcagt_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VCAGT_V2_I]]
+//
uint32x2_t test_vcagt_f32(float32x2_t a, float32x2_t b) {
  return vcagt_f32(a, b);
}

-// CHECK-LABEL: @test_vcagtq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: ret <4 x i32> [[VCAGTQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcagtq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VCAGTQ_V2_I]]
+//
uint32x4_t test_vcagtq_f32(float32x4_t a, float32x4_t b) {
  return vcagtq_f32(a, b);
}

-// CHECK-LABEL: @test_vcale_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> %b, <2 x float> %a)
-// CHECK: ret <2 x i32> [[VCALE_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcale_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[B]], <2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCALE_V2_I]]
+//
uint32x2_t test_vcale_f32(float32x2_t a, float32x2_t b) {
  return vcale_f32(a, b);
}

-// CHECK-LABEL: @test_vcaleq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> %b, <4 x float> %a)
-// CHECK: ret <4 x i32> [[VCALEQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcaleq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[B]], <4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCALEQ_V2_I]]
+//
uint32x4_t test_vcaleq_f32(float32x4_t a, float32x4_t b) {
  return vcaleq_f32(a, b);
}

-// CHECK-LABEL: @test_vcalt_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> %b, <2 x float> %a)
-// CHECK: ret <2 x i32> [[VCALT_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcalt_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[B]], <2 x float> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCALT_V2_I]]
+//
uint32x2_t test_vcalt_f32(float32x2_t a, float32x2_t b) {
  return vcalt_f32(a, b);
}

-// CHECK-LABEL: @test_vcaltq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> %b, <4 x float> %a)
-// CHECK: ret <4 x i32> [[VCALTQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcaltq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[B]], <4 x float> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCALTQ_V2_I]]
+//
uint32x4_t test_vcaltq_f32(float32x4_t a, float32x4_t b) {
  return vcaltq_f32(a, b);
}

-// CHECK-LABEL: @test_vceq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vceq_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vceq_s8(int8x8_t a, int8x8_t b) {
  return vceq_s8(a, b);
}

-// CHECK-LABEL: @test_vceq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vceq_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vceq_s16(int16x4_t a, int16x4_t b) {
  return vceq_s16(a, b);
}

-// CHECK-LABEL: @test_vceq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vceq_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vceq_s32(int32x2_t a, int32x2_t b) {
  return vceq_s32(a, b);
}

-// CHECK-LABEL: @test_vceq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp oeq <2 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vceq_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <2 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vceq_f32(float32x2_t a, float32x2_t b) {
  return vceq_f32(a, b);
}

-// CHECK-LABEL: @test_vceq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vceq_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vceq_u8(uint8x8_t a, uint8x8_t b) {
  return vceq_u8(a, b);
}

-// CHECK-LABEL: @test_vceq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vceq_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vceq_u16(uint16x4_t a, uint16x4_t b) {
  return vceq_u16(a, b);
}

-// CHECK-LABEL: @test_vceq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vceq_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vceq_u32(uint32x2_t a, uint32x2_t b) {
  return vceq_u32(a, b);
}

-// CHECK-LABEL: @test_vceq_p8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vceq_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vceq_p8(poly8x8_t a, poly8x8_t b) {
  return vceq_p8(a, b);
}

-// CHECK-LABEL: @test_vceqq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vceqq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vceqq_s8(int8x16_t a, int8x16_t b) {
  return vceqq_s8(a, b);
}

-// CHECK-LABEL: @test_vceqq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vceqq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vceqq_s16(int16x8_t a, int16x8_t b) {
  return vceqq_s16(a, b);
}

-// CHECK-LABEL: @test_vceqq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vceqq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vceqq_s32(int32x4_t a, int32x4_t b) {
  return vceqq_s32(a, b);
}

-// CHECK-LABEL: @test_vceqq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp oeq <4 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vceqq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oeq <4 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vceqq_f32(float32x4_t a, float32x4_t b) {
  return vceqq_f32(a, b);
}

-// CHECK-LABEL: @test_vceqq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vceqq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vceqq_u8(uint8x16_t a, uint8x16_t b) {
  return vceqq_u8(a, b);
}

-// CHECK-LABEL: @test_vceqq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vceqq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vceqq_u16(uint16x8_t a, uint16x8_t b) {
  return vceqq_u16(a, b);
}

-// CHECK-LABEL: @test_vceqq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vceqq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vceqq_u32(uint32x4_t a, uint32x4_t b) {
  return vceqq_u32(a, b);
}

-// CHECK-LABEL: @test_vceqq_p8(
-// CHECK: [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vceqq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp eq <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vceqq_p8(poly8x16_t a, poly8x16_t b) {
  return vceqq_p8(a, b);
}

-// CHECK-LABEL: @test_vcge_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcge_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcge_s8(int8x8_t a, int8x8_t b) {
  return vcge_s8(a, b);
}

-// CHECK-LABEL: @test_vcge_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcge_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcge_s16(int16x4_t a, int16x4_t b) {
  return vcge_s16(a, b);
}

-// CHECK-LABEL: @test_vcge_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sge <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcge_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcge_s32(int32x2_t a, int32x2_t b) {
  return vcge_s32(a, b);
}

-// CHECK-LABEL: @test_vcge_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp oge <2 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcge_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <2 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcge_f32(float32x2_t a, float32x2_t b) {
  return vcge_f32(a, b);
}

-// CHECK-LABEL: @test_vcge_u8(
-// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcge_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcge_u8(uint8x8_t a, uint8x8_t b) {
  return vcge_u8(a, b);
}

-// CHECK-LABEL: @test_vcge_u16(
-// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcge_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcge_u16(uint16x4_t a, uint16x4_t b) {
  return vcge_u16(a, b);
}

-// CHECK-LABEL: @test_vcge_u32(
-// CHECK: [[CMP_I:%.*]] = icmp uge <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcge_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcge_u32(uint32x2_t a, uint32x2_t b) {
  return vcge_u32(a, b);
}

-// CHECK-LABEL: @test_vcgeq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sge <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcgeq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcgeq_s8(int8x16_t a, int8x16_t b) {
  return vcgeq_s8(a, b);
}

-// CHECK-LABEL: @test_vcgeq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sge <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcgeq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcgeq_s16(int16x8_t a, int16x8_t b) {
  return vcgeq_s16(a, b);
}

-// CHECK-LABEL: @test_vcgeq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sge <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcgeq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sge <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgeq_s32(int32x4_t a, int32x4_t b) {
  return vcgeq_s32(a, b);
}

-// CHECK-LABEL: @test_vcgeq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp oge <4 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcgeq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp oge <4 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgeq_f32(float32x4_t a, float32x4_t b) {
  return vcgeq_f32(a, b);
}

-// CHECK-LABEL: @test_vcgeq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp uge <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcgeq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcgeq_u8(uint8x16_t a, uint8x16_t b) {
  return vcgeq_u8(a, b);
}

-// CHECK-LABEL: @test_vcgeq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp uge <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcgeq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcgeq_u16(uint16x8_t a, uint16x8_t b) {
  return vcgeq_u16(a, b);
}

-// CHECK-LABEL: @test_vcgeq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp uge <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcgeq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp uge <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgeq_u32(uint32x4_t a, uint32x4_t b) {
  return vcgeq_u32(a, b);
}

-// CHECK-LABEL: @test_vcgt_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcgt_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcgt_s8(int8x8_t a, int8x8_t b) {
  return vcgt_s8(a, b);
}

-// CHECK-LABEL: @test_vcgt_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcgt_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcgt_s16(int16x4_t a, int16x4_t b) {
  return vcgt_s16(a, b);
}

-// CHECK-LABEL: @test_vcgt_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcgt_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcgt_s32(int32x2_t a, int32x2_t b) {
  return vcgt_s32(a, b);
}

-// CHECK-LABEL: @test_vcgt_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp ogt <2 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcgt_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <2 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcgt_f32(float32x2_t a, float32x2_t b) {
  return vcgt_f32(a, b);
}

-// CHECK-LABEL: @test_vcgt_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcgt_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcgt_u8(uint8x8_t a, uint8x8_t b) {
  return vcgt_u8(a, b);
}

-// CHECK-LABEL: @test_vcgt_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcgt_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcgt_u16(uint16x4_t a, uint16x4_t b) {
  return vcgt_u16(a, b);
}

-// CHECK-LABEL: @test_vcgt_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcgt_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcgt_u32(uint32x2_t a, uint32x2_t b) {
  return vcgt_u32(a, b);
}

-// CHECK-LABEL: @test_vcgtq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcgtq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcgtq_s8(int8x16_t a, int8x16_t b) {
  return vcgtq_s8(a, b);
}

-// CHECK-LABEL: @test_vcgtq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcgtq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcgtq_s16(int16x8_t a, int16x8_t b) {
  return vcgtq_s16(a, b);
}

-// CHECK-LABEL: @test_vcgtq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sgt <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcgtq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgtq_s32(int32x4_t a, int32x4_t b) {
  return vcgtq_s32(a, b);
}

-// CHECK-LABEL: @test_vcgtq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp ogt <4 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcgtq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ogt <4 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgtq_f32(float32x4_t a, float32x4_t b) {
  return vcgtq_f32(a, b);
}

-// CHECK-LABEL: @test_vcgtq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcgtq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcgtq_u8(uint8x16_t a, uint8x16_t b) {
  return vcgtq_u8(a, b);
}

-// CHECK-LABEL: @test_vcgtq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcgtq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcgtq_u16(uint16x8_t a, uint16x8_t b) {
  return vcgtq_u16(a, b);
}

-// CHECK-LABEL: @test_vcgtq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ugt <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcgtq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcgtq_u32(uint32x4_t a, uint32x4_t b) {
  return vcgtq_u32(a, b);
}

-// CHECK-LABEL: @test_vcle_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcle_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcle_s8(int8x8_t a, int8x8_t b) {
  return vcle_s8(a, b);
}

-// CHECK-LABEL: @test_vcle_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcle_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcle_s16(int16x4_t a, int16x4_t b) {
  return vcle_s16(a, b);
}

-// CHECK-LABEL: @test_vcle_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sle <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcle_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcle_s32(int32x2_t a, int32x2_t b) {
  return vcle_s32(a, b);
}

-// CHECK-LABEL: @test_vcle_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp ole <2 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcle_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <2 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcle_f32(float32x2_t a, float32x2_t b) {
  return vcle_f32(a, b);
}

-// CHECK-LABEL: @test_vcle_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcle_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vcle_u8(uint8x8_t a, uint8x8_t b) {
  return vcle_u8(a, b);
}

-// CHECK-LABEL: @test_vcle_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcle_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vcle_u16(uint16x4_t a, uint16x4_t b) {
  return vcle_u16(a, b);
}

-// CHECK-LABEL: @test_vcle_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ule <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcle_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vcle_u32(uint32x2_t a, uint32x2_t b) {
  return vcle_u32(a, b);
}

-// CHECK-LABEL: @test_vcleq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp sle <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcleq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcleq_s8(int8x16_t a, int8x16_t b) {
  return vcleq_s8(a, b);
}

-// CHECK-LABEL: @test_vcleq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp sle <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcleq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcleq_s16(int16x8_t a, int16x8_t b) {
  return vcleq_s16(a, b);
}

-// CHECK-LABEL: @test_vcleq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp sle <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcleq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp sle <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcleq_s32(int32x4_t a, int32x4_t b) {
  return vcleq_s32(a, b);
}

-// CHECK-LABEL: @test_vcleq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp ole <4 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcleq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp ole <4 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcleq_f32(float32x4_t a, float32x4_t b) {
  return vcleq_f32(a, b);
}

-// CHECK-LABEL: @test_vcleq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ule <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcleq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcleq_u8(uint8x16_t a, uint8x16_t b) {
  return vcleq_u8(a, b);
}

-// CHECK-LABEL: @test_vcleq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ule <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcleq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcleq_u16(uint16x8_t a, uint16x8_t b) {
  return vcleq_u16(a, b);
}

-// CHECK-LABEL: @test_vcleq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ule <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcleq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ule <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcleq_u32(uint32x4_t a, uint32x4_t b) {
  return vcleq_u32(a, b);
}

-// CHECK-LABEL: @test_vcls_s8(
-// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCLS_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcls_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCLS_V_I]]
+//
int8x8_t test_vcls_s8(int8x8_t a) {
  return vcls_s8(a);
}

-// CHECK-LABEL: @test_vcls_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
-// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLS_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcls_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VCLS_V1_I]]
+//
int16x4_t test_vcls_s16(int16x4_t a) {
  return vcls_s16(a);
}

-// CHECK-LABEL: @test_vcls_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
-// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLS_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcls_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCLS_V1_I]]
+//
int32x2_t test_vcls_s32(int32x2_t a) {
  return vcls_s32(a);
}

-// CHECK-LABEL: @test_vcls_u8(
-// CHECK: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VCLS_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vcls_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VCLS_V_I]]
+//
int8x8_t test_vcls_u8(uint8x8_t a) {
  return vcls_u8(a);
}

-// CHECK-LABEL: @test_vcls_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %a)
-// CHECK: [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLS_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vcls_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VCLS_V1_I]]
+//
int16x4_t test_vcls_u16(uint16x4_t a) {
  return vcls_u16(a);
}

-// CHECK-LABEL: @test_vcls_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %a)
-// CHECK: [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLS_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vcls_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VCLS_V1_I]]
+//
int32x2_t test_vcls_u32(uint32x2_t a) {
  return vcls_u32(a);
}

-// CHECK-LABEL: @test_vclsq_s8(
-// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCLSQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vclsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCLSQ_V_I]]
+//
int8x16_t test_vclsq_s8(int8x16_t a) {
  return vclsq_s8(a);
}

-// CHECK-LABEL: @test_vclsq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
-// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VCLSQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vclsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VCLSQ_V1_I]]
+//
int16x8_t test_vclsq_s16(int16x8_t a) {
  return vclsq_s16(a);
}

-// CHECK-LABEL: @test_vclsq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
-// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VCLSQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vclsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCLSQ_V1_I]]
+//
int32x4_t test_vclsq_s32(int32x4_t a) {
  return vclsq_s32(a);
}

-// CHECK-LABEL: @test_vclsq_u8(
-// CHECK: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VCLSQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vclsq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VCLSQ_V_I]]
+//
int8x16_t test_vclsq_u8(uint8x16_t a) {
  return vclsq_u8(a);
}

-// CHECK-LABEL: @test_vclsq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %a)
-// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VCLSQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vclsq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VCLSQ_V1_I]]
+//
int16x8_t test_vclsq_u16(uint16x8_t a) {
  return vclsq_u16(a);
}

-// CHECK-LABEL: @test_vclsq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %a)
-// CHECK: [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VCLSQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vclsq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VCLSQ_V1_I]]
+//
int32x4_t test_vclsq_u32(uint32x4_t a) {
  return vclsq_u32(a);
}

-// CHECK-LABEL: @test_vclt_s8(
-// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vclt_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vclt_s8(int8x8_t a, int8x8_t b) {
  return vclt_s8(a, b);
}

-// CHECK-LABEL: @test_vclt_s16(
-// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vclt_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vclt_s16(int16x4_t a, int16x4_t b) {
  return vclt_s16(a, b);
}

-// CHECK-LABEL: @test_vclt_s32(
-// CHECK: [[CMP_I:%.*]] = icmp slt <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vclt_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vclt_s32(int32x2_t a, int32x2_t b) {
  return vclt_s32(a, b);
}

-// CHECK-LABEL: @test_vclt_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp olt <2 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vclt_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <2 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vclt_f32(float32x2_t a, float32x2_t b) {
  return vclt_f32(a, b);
}

-// CHECK-LABEL: @test_vclt_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i8> @test_vclt_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[SEXT_I]]
+//
uint8x8_t test_vclt_u8(uint8x8_t a, uint8x8_t b) {
  return vclt_u8(a, b);
}

-// CHECK-LABEL: @test_vclt_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i16> @test_vclt_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[SEXT_I]]
+//
uint16x4_t test_vclt_u16(uint16x4_t a, uint16x4_t b) {
  return vclt_u16(a, b);
}

-// CHECK-LABEL: @test_vclt_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ult <2 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <2 x i32> @test_vclt_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SEXT_I]]
+//
uint32x2_t test_vclt_u32(uint32x2_t a, uint32x2_t b) {
  return vclt_u32(a, b);
}

-// CHECK-LABEL: @test_vcltq_s8(
-// CHECK: [[CMP_I:%.*]] = icmp slt <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcltq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcltq_s8(int8x16_t a, int8x16_t b) {
  return vcltq_s8(a, b);
}

-// CHECK-LABEL: @test_vcltq_s16(
-// CHECK: [[CMP_I:%.*]] = icmp slt <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcltq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcltq_s16(int16x8_t a, int16x8_t b) {
  return vcltq_s16(a, b);
}

-// CHECK-LABEL: @test_vcltq_s32(
-// CHECK: [[CMP_I:%.*]] = icmp slt <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcltq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp slt <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcltq_s32(int32x4_t a, int32x4_t b) {
  return vcltq_s32(a, b);
}

-// CHECK-LABEL: @test_vcltq_f32(
-// CHECK: [[CMP_I:%.*]] = fcmp olt <4 x float> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcltq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt <4 x float> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcltq_f32(float32x4_t a, float32x4_t b) {
  return vcltq_f32(a, b);
}

-// CHECK-LABEL: @test_vcltq_u8(
-// CHECK: [[CMP_I:%.*]] = icmp ult <16 x i8> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
-// CHECK: ret <16 x i8> [[SEXT_I]]
+// CHECK-LABEL: define <16 x i8> @test_vcltq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[SEXT_I]]
+//
uint8x16_t test_vcltq_u8(uint8x16_t a, uint8x16_t b) {
  return vcltq_u8(a, b);
}

-// CHECK-LABEL: @test_vcltq_u16(
-// CHECK: [[CMP_I:%.*]] = icmp ult <8 x i16> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK: ret <8 x i16> [[SEXT_I]]
+// CHECK-LABEL: define <8 x i16> @test_vcltq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[SEXT_I]]
+//
uint16x8_t test_vcltq_u16(uint16x8_t a, uint16x8_t b) {
  return vcltq_u16(a, b);
}

-// CHECK-LABEL: @test_vcltq_u32(
-// CHECK: [[CMP_I:%.*]] = icmp ult <4 x i32> %a, %b
-// CHECK: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-// CHECK: ret <4 x i32> [[SEXT_I]]
+// CHECK-LABEL: define <4 x i32> @test_vcltq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[CMP_I:%.*]] = icmp ult <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[SEXT_I]]
+//
uint32x4_t test_vcltq_u32(uint32x4_t a, uint32x4_t b) {
  return vcltq_u32(a, b);
}

-// CHECK-LABEL: @test_vclz_s8(
-// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false)
-// CHECK: ret <8 x i8> [[VCLZ_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vclz_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[A]], i1 false)
+// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]]
+//
int8x8_t test_vclz_s8(int8x8_t a) {
  return vclz_s8(a);
}

-// CHECK-LABEL: @test_vclz_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vclz_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i16 0, 17) <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[A]], i1 false)
+// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]]
+//
int16x4_t test_vclz_s16(int16x4_t a) {
  return vclz_s16(a);
}

-// CHECK-LABEL: @test_vclz_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vclz_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[A]], i1 false)
+// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]]
+//
int32x2_t test_vclz_s32(int32x2_t a) {
  return vclz_s32(a);
}

-// CHECK-LABEL: @test_vclz_u8(
-// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false)
-// CHECK: ret <8 x i8> [[VCLZ_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vclz_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[A]], i1 false)
+// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]]
+//
uint8x8_t test_vclz_u8(uint8x8_t a) {
  return vclz_u8(a);
}

-// CHECK-LABEL: @test_vclz_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %a, i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vclz_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i16 0, 17) <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[A]], i1 false)
+// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]]
+//
uint16x4_t test_vclz_u16(uint16x4_t a) {
  return vclz_u16(a);
}

-// CHECK-LABEL: @test_vclz_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %a, i1 false)
-// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VCLZ_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vclz_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[A]], i1 false)
+// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]]
+//
uint32x2_t test_vclz_u32(uint32x2_t a) {
  return vclz_u32(a);
}

-// CHECK-LABEL: @test_vclzq_s8(
-// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false)
-// CHECK: ret <16 x i8> [[VCLZQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[A]], i1 false)
+// CHECK-NEXT: ret <16 x i8> [[VCLZQ_V_I]]
+//
int8x16_t test_vclzq_s8(int8x16_t a) {
  return vclzq_s8(a);
}

-// CHECK-LABEL: @test_vclzq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false)
-// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VCLZQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call range(i16 0, 17) <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[A]], i1 false)
+// CHECK-NEXT: ret <8 x i16> [[VCLZQ_V1_I]]
+//
int16x8_t test_vclzq_s16(int16x8_t a) {
  return vclzq_s16(a);
}

-// CHECK-LABEL: @test_vclzq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false)
-// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VCLZQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call range(i32 0, 33) <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[A]], i1 false)
+// CHECK-NEXT: ret <4 x i32> [[VCLZQ_V1_I]]
+//
int32x4_t
test_vclzq_s32(int32x4_t a) { return vclzq_s32(a); } -// CHECK-LABEL: @test_vclzq_u8( -// CHECK: [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) -// CHECK: ret <16 x i8> [[VCLZQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vclzq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[A]], i1 false) +// CHECK-NEXT: ret <16 x i8> [[VCLZQ_V_I]] +// uint8x16_t test_vclzq_u8(uint8x16_t a) { return vclzq_u8(a); } -// CHECK-LABEL: @test_vclzq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %a, i1 false) -// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VCLZQ_V1_I]] +// CHECK-LABEL: define <8 x i16> @test_vclzq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call range(i16 0, 17) <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[A]], i1 false) +// CHECK-NEXT: ret <8 x i16> [[VCLZQ_V1_I]] +// uint16x8_t test_vclzq_u16(uint16x8_t a) { return vclzq_u16(a); } -// CHECK-LABEL: @test_vclzq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %a, i1 false) -// CHECK: [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VCLZQ_V1_I]] +// CHECK-LABEL: define <4 x i32> @test_vclzq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCLZQ_V1_I:%.*]] = call range(i32 0, 33) <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[A]], i1 false) +// CHECK-NEXT: ret <4 x i32> [[VCLZQ_V1_I]] +// uint32x4_t test_vclzq_u32(uint32x4_t a) { return vclzq_u32(a); } -// CHECK-LABEL: @test_vcnt_u8( -// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VCNT_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vcnt_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNT_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]] +// uint8x8_t test_vcnt_u8(uint8x8_t a) { return vcnt_u8(a); } -// CHECK-LABEL: @test_vcnt_s8( -// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VCNT_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vcnt_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNT_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]] +// int8x8_t test_vcnt_s8(int8x8_t a) { return vcnt_s8(a); } -// CHECK-LABEL: @test_vcnt_p8( -// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VCNT_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vcnt_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNT_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]] +// poly8x8_t test_vcnt_p8(poly8x8_t a) { return vcnt_p8(a); } -// CHECK-LABEL: @test_vcntq_u8( -// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VCNTQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vcntq_u8( +// 
CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]] +// uint8x16_t test_vcntq_u8(uint8x16_t a) { return vcntq_u8(a); } -// CHECK-LABEL: @test_vcntq_s8( -// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VCNTQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vcntq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]] +// int8x16_t test_vcntq_s8(int8x16_t a) { return vcntq_s8(a); } -// CHECK-LABEL: @test_vcntq_p8( -// CHECK: [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VCNTQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vcntq_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCNTQ_V_I:%.*]] = call range(i8 0, 9) <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VCNTQ_V_I]] +// poly8x16_t test_vcntq_p8(poly8x16_t a) { return vcntq_p8(a); } -// CHECK-LABEL: @test_vcombine_s8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define <16 x i8> @test_vcombine_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// int8x16_t test_vcombine_s8(int8x8_t a, int8x8_t b) { return vcombine_s8(a, b); } -// CHECK-LABEL: @test_vcombine_s16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define <8 x i16> @test_vcombine_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// int16x8_t test_vcombine_s16(int16x4_t a, int16x4_t b) { return vcombine_s16(a, b); } -// CHECK-LABEL: @test_vcombine_s32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define <4 x i32> @test_vcombine_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// int32x4_t test_vcombine_s32(int32x2_t a, int32x2_t b) { return vcombine_s32(a, b); } -// CHECK-LABEL: @test_vcombine_s64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define <2 x i64> @test_vcombine_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> [[A]], <1 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// int64x2_t test_vcombine_s64(int64x1_t a, int64x1_t b) { return vcombine_s64(a, b); } -// CHECK-LABEL: @test_vcombine_f16( -// 
CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> -// CHECK: ret <8 x half> [[SHUFFLE_I]] +// CHECK-LABEL: define <8 x half> @test_vcombine_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x half> [[SHUFFLE_I]] +// float16x8_t test_vcombine_f16(float16x4_t a, float16x4_t b) { return vcombine_f16(a, b); } -// CHECK-LABEL: @test_vcombine_f32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> -// CHECK: ret <4 x float> [[SHUFFLE_I]] +// CHECK-LABEL: define <4 x float> @test_vcombine_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]] +// float32x4_t test_vcombine_f32(float32x2_t a, float32x2_t b) { return vcombine_f32(a, b); } -// CHECK-LABEL: @test_vcombine_u8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define <16 x i8> @test_vcombine_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// uint8x16_t test_vcombine_u8(uint8x8_t a, uint8x8_t b) { return vcombine_u8(a, b); } -// CHECK-LABEL: @test_vcombine_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define <8 x i16> @test_vcombine_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// uint16x8_t test_vcombine_u16(uint16x4_t a, uint16x4_t b) { return vcombine_u16(a, b); } -// CHECK-LABEL: @test_vcombine_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define <4 x i32> @test_vcombine_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// uint32x4_t test_vcombine_u32(uint32x2_t a, uint32x2_t b) { return vcombine_u32(a, b); } -// CHECK-LABEL: @test_vcombine_u64( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE_I]] +// CHECK-LABEL: define <2 x i64> @test_vcombine_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> [[A]], <1 x i64> [[B]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] +// uint64x2_t test_vcombine_u64(uint64x1_t a, uint64x1_t b) { return vcombine_u64(a, b); } -// CHECK-LABEL: @test_vcombine_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define <16 x i8> @test_vcombine_p8( 
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// poly8x16_t test_vcombine_p8(poly8x8_t a, poly8x8_t b) { return vcombine_p8(a, b); } -// CHECK-LABEL: @test_vcombine_p16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define <8 x i16> @test_vcombine_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// poly16x8_t test_vcombine_p16(poly16x4_t a, poly16x4_t b) { return vcombine_p16(a, b); } -// CHECK-LABEL: @test_vcreate_s8( -// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8> -// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) -// CHECK: ret <8 x i8> [[VCLZ_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vcreate_s8( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <8 x i8> +// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) +// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]] +// int8x8_t test_vcreate_s8(uint64_t a) { return vclz_s8(vcreate_s8(a)); } -// CHECK-LABEL: @test_vcreate_imm -// CHECK: [[RES:%.*]] = bitcast i64 0 to <4 x i16> -// CHECK: ret <4 x i16> [[RES]] +// CHECK-LABEL: define <4 x i16> @test_vcreate_imm( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <4 x i16> zeroinitializer +// int16x4_t test_vcreate_imm(void) { return vcreate_s16(0); } -// CHECK-LABEL: @test_vcreate_s16( -// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false) -// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VCLZ_V1_I]] +// CHECK-LABEL: define <4 x i16> @test_vcreate_s16( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16> +// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i16 0, 17) <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false) +// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]] +// int16x4_t test_vcreate_s16(uint64_t a) { return vclz_s16(vcreate_s16(a)); } -// CHECK-LABEL: @test_vcreate_s32( -// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false) -// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VCLZ_V1_I]] +// CHECK-LABEL: define <2 x i32> @test_vcreate_s32( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <2 x i32> +// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false) +// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]] +// int32x2_t test_vcreate_s32(uint64_t a) { return vclz_s32(vcreate_s32(a)); } -// CHECK-LABEL: @test_vcreate_f16( -// CHECK: [[TMP0:%.*]] = bitcast i64 %a to 
<4 x half> -// CHECK: ret <4 x half> [[TMP0]] +// CHECK-LABEL: define <4 x half> @test_vcreate_f16( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP0]] +// float16x4_t test_vcreate_f16(uint64_t a) { return vcreate_f16(a); } -// CHECK-LABEL: @test_vcreate_f32( -// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x float> -// CHECK: ret <2 x float> [[TMP0]] +// CHECK-LABEL: define <2 x float> @test_vcreate_f32( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP0]] +// float32x2_t test_vcreate_f32(uint64_t a) { return vcreate_f32(a); } -// CHECK-LABEL: @test_vcreate_u8( -// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8> -// CHECK: [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) -// CHECK: ret <8 x i8> [[VCLZ_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vcreate_u8( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <8 x i8> +// CHECK-NEXT: [[VCLZ_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) +// CHECK-NEXT: ret <8 x i8> [[VCLZ_V_I]] +// int8x8_t test_vcreate_u8(uint64_t a) { return vclz_s8((int8x8_t)vcreate_u8(a)); } -// CHECK-LABEL: @test_vcreate_u16( -// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK: [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false) -// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VCLZ_V1_I]] +// CHECK-LABEL: define <4 x i16> @test_vcreate_u16( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <4 x i16> +// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i16 0, 17) <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[TMP0]], i1 false) +// CHECK-NEXT: ret <4 x i16> [[VCLZ_V1_I]] +// int16x4_t test_vcreate_u16(uint64_t a) { return vclz_s16((int16x4_t)vcreate_u16(a)); } -// CHECK-LABEL: @test_vcreate_u32( -// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <2 x i32> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> -// CHECK: [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false) -// CHECK: [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VCLZ_V1_I]] +// CHECK-LABEL: define <2 x i32> @test_vcreate_u32( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <2 x i32> +// CHECK-NEXT: [[VCLZ_V1_I:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[TMP0]], i1 false) +// CHECK-NEXT: ret <2 x i32> [[VCLZ_V1_I]] +// int32x2_t test_vcreate_u32(uint64_t a) { return vclz_s32((int32x2_t)vcreate_u32(a)); } -// CHECK-LABEL: @test_vcreate_u64( -// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64> -// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]] -// CHECK: ret <1 x i64> [[ADD_I]] +// CHECK-LABEL: define <1 x i64> @test_vcreate_u64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0 +// CHECK-NEXT: [[ADD_I:%.*]] = shl <1 x i64> [[TMP0]], splat (i64 1) +// CHECK-NEXT: ret <1 
x i64> [[ADD_I]] +// uint64x1_t test_vcreate_u64(uint64_t a) { uint64x1_t tmp = vcreate_u64(a); return vadd_u64(tmp, tmp); } -// CHECK-LABEL: @test_vcreate_p8( -// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <8 x i8> -// CHECK: [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]]) -// CHECK: ret <8 x i8> [[VCNT_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vcreate_p8( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <8 x i8> +// CHECK-NEXT: [[VCNT_V_I:%.*]] = call range(i8 0, 9) <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]]) +// CHECK-NEXT: ret <8 x i8> [[VCNT_V_I]] +// poly8x8_t test_vcreate_p8(uint64_t a) { return vcnt_p8(vcreate_p8(a)); } -// CHECK-LABEL: @test_vcreate_p16( -// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> -// CHECK: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]]) -// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16> -// CHECK: ret <4 x i16> [[TMP4]] +// CHECK-LABEL: define <4 x i16> @test_vcreate_p16( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[A]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[A]] to <8 x i8> +// CHECK-NEXT: [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[TMP3]] +// poly16x4_t test_vcreate_p16(uint64_t a) { poly16x4_t tmp = vcreate_p16(a); return vbsl_p16((uint16x4_t)tmp, tmp, tmp); } -// CHECK-LABEL: @test_vcreate_s64( -// CHECK: [[TMP0:%.*]] = bitcast i64 %a to <1 x i64> -// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]] -// CHECK: ret <1 x i64> [[ADD_I]] +// CHECK-LABEL: define <1 x i64> @test_vcreate_s64( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0 +// CHECK-NEXT: [[ADD_I:%.*]] = shl <1 x i64> [[TMP0]], splat (i64 1) +// CHECK-NEXT: ret <1 x i64> [[ADD_I]] +// int64x1_t test_vcreate_s64(uint64_t a) { int64x1_t tmp = vcreate_s64(a); return vadd_s64(tmp, tmp); } -// CHECK-LABEL: @test_vcvt_f16_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %a) -// CHECK: [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half> -// CHECK: ret <4 x half> [[TMP1]] +// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> [[A]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <4 x half> +// CHECK-NEXT: ret <4 x half> [[TMP0]] +// float16x4_t test_vcvt_f16_f32(float32x4_t a) { return vcvt_f16_f32(a); } -// CHECK-LABEL: @test_vcvt_f32_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCVT_I:%.*]] = sitofp <2 x i32> %a to <2 x 
float> -// CHECK: ret <2 x float> [[VCVT_I]] +// CHECK-LABEL: define <2 x float> @test_vcvt_f32_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <2 x i32> [[A]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[VCVT_I]] +// float32x2_t test_vcvt_f32_s32(int32x2_t a) { return vcvt_f32_s32(a); } -// CHECK-LABEL: @test_vcvt_f32_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCVT_I:%.*]] = uitofp <2 x i32> %a to <2 x float> -// CHECK: ret <2 x float> [[VCVT_I]] +// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <2 x i32> [[A]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[VCVT_I]] +// float32x2_t test_vcvt_f32_u32(uint32x2_t a) { return vcvt_f32_u32(a); } -// CHECK-LABEL: @test_vcvtq_f32_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCVT_I:%.*]] = sitofp <4 x i32> %a to <4 x float> -// CHECK: ret <4 x float> [[VCVT_I]] +// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = sitofp <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[VCVT_I]] +// float32x4_t test_vcvtq_f32_s32(int32x4_t a) { return vcvtq_f32_s32(a); } -// CHECK-LABEL: @test_vcvtq_f32_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCVT_I:%.*]] = uitofp <4 x i32> %a to <4 x float> -// CHECK: ret <4 x float> [[VCVT_I]] +// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = uitofp <4 x i32> [[A]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[VCVT_I]] +// float32x4_t test_vcvtq_f32_u32(uint32x4_t a) { return vcvtq_f32_u32(a); } -// CHECK-LABEL: @test_vcvt_f32_f16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> -// CHECK: [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) -// CHECK: [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VCVT_F32_F161_I]] +// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_F32_F16_I:%.*]] = bitcast <4 x half> [[A]] to <4 x i16> +// CHECK-NEXT: [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) +// CHECK-NEXT: ret <4 x float> [[VCVT_F32_F161_I]] +// float32x4_t test_vcvt_f32_f16(float16x4_t a) { return vcvt_f32_f16(a); } -// CHECK-LABEL: @test_vcvt_n_f32_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1) -// CHECK: ret <2 x float> [[VCVT_N1]] +// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[A]], i32 1) +// CHECK-NEXT: ret <2 x float> [[VCVT_N1]] +// float32x2_t test_vcvt_n_f32_s32(int32x2_t a) { return 
vcvt_n_f32_s32(a, 1); } -// CHECK-LABEL: @test_vcvt_n_f32_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1) -// CHECK: ret <2 x float> [[VCVT_N1]] +// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[A]], i32 1) +// CHECK-NEXT: ret <2 x float> [[VCVT_N1]] +// float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) { return vcvt_n_f32_u32(a, 1); } -// CHECK-LABEL: @test_vcvtq_n_f32_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3) -// CHECK: ret <4 x float> [[VCVT_N1]] +// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[A]], i32 3) +// CHECK-NEXT: ret <4 x float> [[VCVT_N1]] +// float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) { return vcvtq_n_f32_s32(a, 3); } -// CHECK-LABEL: @test_vcvtq_n_f32_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3) -// CHECK: ret <4 x float> [[VCVT_N1]] +// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[A]], i32 3) +// CHECK-NEXT: ret <4 x float> [[VCVT_N1]] +// float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) { return vcvtq_n_f32_u32(a, 3); } -// CHECK-LABEL: @test_vcvt_n_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1) -// CHECK: ret <2 x i32> [[VCVT_N1]] +// CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[A]], i32 1) +// CHECK-NEXT: ret <2 x i32> [[VCVT_N1]] +// int32x2_t test_vcvt_n_s32_f32(float32x2_t a) { return vcvt_n_s32_f32(a, 1); } -// CHECK-LABEL: @test_vcvtq_n_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3) -// CHECK: ret <4 x i32> [[VCVT_N1]] +// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[A]], i32 3) +// CHECK-NEXT: ret <4 x i32> [[VCVT_N1]] +// int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) { return vcvtq_n_s32_f32(a, 3); } 
-// CHECK-LABEL: @test_vcvt_n_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1) -// CHECK: ret <2 x i32> [[VCVT_N1]] +// CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[A]], i32 1) +// CHECK-NEXT: ret <2 x i32> [[VCVT_N1]] +// uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) { return vcvt_n_u32_f32(a, 1); } -// CHECK-LABEL: @test_vcvtq_n_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> -// CHECK: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3) -// CHECK: ret <4 x i32> [[VCVT_N1]] +// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[A]], i32 3) +// CHECK-NEXT: ret <4 x i32> [[VCVT_N1]] +// uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) { return vcvtq_n_u32_f32(a, 3); } -// CHECK-LABEL: @test_vcvt_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVT_I:%.*]] = fptosi <2 x float> %a to <2 x i32> -// CHECK: ret <2 x i32> [[VCVT_I]] +// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = fptosi <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCVT_I]] +// int32x2_t test_vcvt_s32_f32(float32x2_t a) { return vcvt_s32_f32(a); } -// CHECK-LABEL: @test_vcvtq_s32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVT_I:%.*]] = fptosi <4 x float> %a to <4 x i32> -// CHECK: ret <4 x i32> [[VCVT_I]] +// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = fptosi <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCVT_I]] +// int32x4_t test_vcvtq_s32_f32(float32x4_t a) { return vcvtq_s32_f32(a); } -// CHECK-LABEL: @test_vcvt_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VCVT_I:%.*]] = fptoui <2 x float> %a to <2 x i32> -// CHECK: ret <2 x i32> [[VCVT_I]] +// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = fptoui <2 x float> [[A]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VCVT_I]] +// uint32x2_t test_vcvt_u32_f32(float32x2_t a) { return vcvt_u32_f32(a); } -// CHECK-LABEL: @test_vcvtq_u32_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VCVT_I:%.*]] = fptoui <4 x float> %a to <4 x i32> -// CHECK: ret <4 x i32> [[VCVT_I]] +// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VCVT_I:%.*]] = fptoui <4 x float> [[A]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VCVT_I]] +// uint32x4_t test_vcvtq_u32_f32(float32x4_t a) { return vcvtq_u32_f32(a); } -// CHECK-LABEL: 
@test_vdup_lane_u8( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE]] +// CHECK-LABEL: define <8 x i8> @test_vdup_lane_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// uint8x8_t test_vdup_lane_u8(uint8x8_t a) { return vdup_lane_u8(a, 7); } -// CHECK-LABEL: @test_vdup_lane_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK: ret <4 x i16> [[LANE]] +// CHECK-LABEL: define <4 x i16> @test_vdup_lane_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[LANE]] +// uint16x4_t test_vdup_lane_u16(uint16x4_t a) { return vdup_lane_u16(a, 3); } -// CHECK-LABEL: @test_vdup_lane_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK: ret <2 x i32> [[LANE]] +// CHECK-LABEL: define <2 x i32> @test_vdup_lane_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[LANE]] +// uint32x2_t test_vdup_lane_u32(uint32x2_t a) { return vdup_lane_u32(a, 1); } -// CHECK-LABEL: @test_vdup_lane_s8( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE]] +// CHECK-LABEL: define <8 x i8> @test_vdup_lane_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// int8x8_t test_vdup_lane_s8(int8x8_t a) { return vdup_lane_s8(a, 7); } -// CHECK-LABEL: @test_vdup_lane_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK: ret <4 x i16> [[LANE]] +// CHECK-LABEL: define <4 x i16> @test_vdup_lane_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[LANE]] +// int16x4_t test_vdup_lane_s16(int16x4_t a) { return vdup_lane_s16(a, 3); } -// CHECK-LABEL: @test_vdup_lane_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK: ret <2 x i32> [[LANE]] +// CHECK-LABEL: define <2 x i32> @test_vdup_lane_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[LANE]] +// int32x2_t test_vdup_lane_s32(int32x2_t a) { return vdup_lane_s32(a, 1); } -// 
CHECK-LABEL: @test_vdup_lane_p8( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> -// CHECK: ret <8 x i8> [[SHUFFLE]] +// CHECK-LABEL: define <8 x i8> @test_vdup_lane_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// poly8x8_t test_vdup_lane_p8(poly8x8_t a) { return vdup_lane_p8(a, 7); } -// CHECK-LABEL: @test_vdup_lane_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK: ret <4 x i16> [[LANE]] +// CHECK-LABEL: define <4 x i16> @test_vdup_lane_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: ret <4 x i16> [[LANE]] +// poly16x4_t test_vdup_lane_p16(poly16x4_t a) { return vdup_lane_p16(a, 3); } -// CHECK-LABEL: @test_vdup_lane_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> -// CHECK: ret <2 x float> [[LANE]] +// CHECK-LABEL: define <2 x float> @test_vdup_lane_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[A]], <2 x float> poison, <2 x i32> +// CHECK-NEXT: ret <2 x float> [[LANE]] +// float32x2_t test_vdup_lane_f32(float32x2_t a) { return vdup_lane_f32(a, 1); } -// CHECK-LABEL: @test_vdupq_lane_u8( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE]] +// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// uint8x16_t test_vdupq_lane_u8(uint8x8_t a) { return vdupq_lane_u8(a, 7); } -// CHECK-LABEL: @test_vdupq_lane_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK: ret <8 x i16> [[LANE]] +// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[LANE]] +// uint16x8_t test_vdupq_lane_u16(uint16x4_t a) { return vdupq_lane_u16(a, 3); } -// CHECK-LABEL: @test_vdupq_lane_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK: ret <4 x i32> [[LANE]] +// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[LANE]] +// uint32x4_t 
test_vdupq_lane_u32(uint32x2_t a) { return vdupq_lane_u32(a, 1); } -// CHECK-LABEL: @test_vdupq_lane_s8( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE]] +// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// int8x16_t test_vdupq_lane_s8(int8x8_t a) { return vdupq_lane_s8(a, 7); } -// CHECK-LABEL: @test_vdupq_lane_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK: ret <8 x i16> [[LANE]] +// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[LANE]] +// int16x8_t test_vdupq_lane_s16(int16x4_t a) { return vdupq_lane_s16(a, 3); } -// CHECK-LABEL: @test_vdupq_lane_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK: ret <4 x i32> [[LANE]] +// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[LANE]] +// int32x4_t test_vdupq_lane_s32(int32x2_t a) { return vdupq_lane_s32(a, 1); } -// CHECK-LABEL: @test_vdupq_lane_p8( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE]] +// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// poly8x16_t test_vdupq_lane_p8(poly8x8_t a) { return vdupq_lane_p8(a, 7); } -// CHECK-LABEL: @test_vdupq_lane_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK: ret <8 x i16> [[LANE]] +// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[LANE]] +// poly16x8_t test_vdupq_lane_p16(poly16x4_t a) { return vdupq_lane_p16(a, 3); } -// CHECK-LABEL: @test_vdupq_lane_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> -// CHECK: ret <4 x float> [[LANE]] +// CHECK-LABEL: define <4 x float> @test_vdupq_lane_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[A]], <2 x float> poison, <4 
x i32> +// CHECK-NEXT: ret <4 x float> [[LANE]] +// float32x4_t test_vdupq_lane_f32(float32x2_t a) { return vdupq_lane_f32(a, 1); } -// CHECK-LABEL: @test_vdup_lane_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[LANE]] +// CHECK-LABEL: define <1 x i64> @test_vdup_lane_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <1 x i64> [[A]] +// int64x1_t test_vdup_lane_s64(int64x1_t a) { return vdup_lane_s64(a, 0); } -// CHECK-LABEL: @test_vdup_lane_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[LANE]] +// CHECK-LABEL: define <1 x i64> @test_vdup_lane_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <1 x i64> [[A]] +// uint64x1_t test_vdup_lane_u64(uint64x1_t a) { return vdup_lane_u64(a, 0); } -// CHECK-LABEL: @test_vdupq_lane_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[LANE]] +// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[A]], <1 x i64> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> [[LANE]] +// int64x2_t test_vdupq_lane_s64(int64x1_t a) { return vdupq_lane_s64(a, 0); } -// CHECK-LABEL: @test_vdupq_lane_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[LANE]] +// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x i64> [[A]], <1 x i64> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> [[LANE]] +// uint64x2_t test_vdupq_lane_u64(uint64x1_t a) { return vdupq_lane_u64(a, 0); } -// CHECK-LABEL: @test_vdup_n_u8( -// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7 -// CHECK: ret <8 x i8> [[VECINIT7_I]] +// CHECK-LABEL: define <8 x i8> @test_vdup_n_u8( +// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x 
i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]] +// uint8x8_t test_vdup_n_u8(uint8_t a) { return vdup_n_u8(a); } -// CHECK-LABEL: @test_vdup_n_u16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3 -// CHECK: ret <4 x i16> [[VECINIT3_I]] +// CHECK-LABEL: define <4 x i16> @test_vdup_n_u16( +// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]] +// uint16x4_t test_vdup_n_u16(uint16_t a) { return vdup_n_u16(a); } -// CHECK-LABEL: @test_vdup_n_u32( -// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1 -// CHECK: ret <2 x i32> [[VECINIT1_I]] +// CHECK-LABEL: define <2 x i32> @test_vdup_n_u32( +// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i32> [[VECINIT1_I]] +// uint32x2_t test_vdup_n_u32(uint32_t a) { return vdup_n_u32(a); } -// CHECK-LABEL: @test_vdup_n_s8( -// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3 -// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4 -// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5 -// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6 -// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7 -// CHECK: ret <8 x i8> [[VECINIT7_I]] +// CHECK-LABEL: define <8 x i8> @test_vdup_n_s8( +// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]] +// int8x8_t test_vdup_n_s8(int8_t a) { return vdup_n_s8(a); } -// CHECK-LABEL: @test_vdup_n_s16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1 -// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2 -// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3 -// CHECK: ret <4 x i16> [[VECINIT3_I]] +// CHECK-LABEL: define <4 x i16> @test_vdup_n_s16( +// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
+//
 int16x4_t test_vdup_n_s16(int16_t a) { return vdup_n_s16(a); }
-// CHECK-LABEL: @test_vdup_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: ret <2 x i32> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vdup_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i32> [[VECINIT1_I]]
+//
 int32x2_t test_vdup_n_s32(int32_t a) { return vdup_n_s32(a); }
-// CHECK-LABEL: @test_vdup_n_p8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i8> @test_vdup_n_p8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
+//
 poly8x8_t test_vdup_n_p8(poly8_t a) { return vdup_n_p8(a); }
-// CHECK-LABEL: @test_vdup_n_p16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i16> @test_vdup_n_p16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
+//
 poly16x4_t test_vdup_n_p16(poly16_t a) { return vdup_n_p16(a); }
-// CHECK-LABEL: @test_vdup_n_f16(
-// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
-// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0
-// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
-// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
-// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
-// CHECK: ret <4 x half> [[VECINIT3]]
+// CHECK-LABEL: define <4 x half> @test_vdup_n_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i64 0
+// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x half> [[VECINIT3]]
+//
 float16x4_t test_vdup_n_f16(float16_t *a) { return vdup_n_f16(*a); }
-// CHECK-LABEL: @test_vdup_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
-// CHECK: ret <2 x float> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x float> @test_vdup_n_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x float> [[VECINIT1_I]]
+//
 float32x2_t test_vdup_n_f32(float32_t a) { return vdup_n_f32(a); }
-// CHECK-LABEL: @test_vdupq_n_u8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
-// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
-// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
-// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
-// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
-// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
-// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
-// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VECINIT15_I]]
+// CHECK-LABEL: define <16 x i8> @test_vdupq_n_u8(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
+//
 uint8x16_t test_vdupq_n_u8(uint8_t a) { return vdupq_n_u8(a); }
-// CHECK-LABEL: @test_vdupq_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i16> @test_vdupq_n_u16(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
+//
 uint16x8_t test_vdupq_n_u16(uint16_t a) { return vdupq_n_u16(a); }
-// CHECK-LABEL: @test_vdupq_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
-// CHECK: ret <4 x i32> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vdupq_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]]
+//
 uint32x4_t test_vdupq_n_u32(uint32_t a) { return vdupq_n_u32(a); }
-// CHECK-LABEL: @test_vdupq_n_s8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
-// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
-// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
-// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
-// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
-// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
-// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
-// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VECINIT15_I]]
+// CHECK-LABEL: define <16 x i8> @test_vdupq_n_s8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
+//
 int8x16_t test_vdupq_n_s8(int8_t a) { return vdupq_n_s8(a); }
-// CHECK-LABEL: @test_vdupq_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i16> @test_vdupq_n_s16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
+//
 int16x8_t test_vdupq_n_s16(int16_t a) { return vdupq_n_s16(a); }
-// CHECK-LABEL: @test_vdupq_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
-// CHECK: ret <4 x i32> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vdupq_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]]
+//
 int32x4_t test_vdupq_n_s32(int32_t a) { return vdupq_n_s32(a); }
-// CHECK-LABEL: @test_vdupq_n_p8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
-// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
-// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
-// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
-// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
-// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
-// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
-// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VECINIT15_I]]
+// CHECK-LABEL: define <16 x i8> @test_vdupq_n_p8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
+//
 poly8x16_t test_vdupq_n_p8(poly8_t a) { return vdupq_n_p8(a); }
-// CHECK-LABEL: @test_vdupq_n_p16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i16> @test_vdupq_n_p16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
+//
 poly16x8_t test_vdupq_n_p16(poly16_t a) { return vdupq_n_p16(a); }
-// CHECK-LABEL: @test_vdupq_n_f16(
-// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
-// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0
-// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
-// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
-// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
-// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
-// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
-// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
-// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
-// CHECK: ret <8 x half> [[VECINIT7]]
+// CHECK-LABEL: define <8 x half> @test_vdupq_n_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i64 0
+// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x half> [[VECINIT7]]
+//
 float16x8_t test_vdupq_n_f16(float16_t *a) { return vdupq_n_f16(*a); }
-// CHECK-LABEL: @test_vdupq_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
-// CHECK: ret <4 x float> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x float> [[VECINIT3_I]]
+//
 float32x4_t test_vdupq_n_f32(float32_t a) { return vdupq_n_f32(a); }
-// CHECK-LABEL: @test_vdup_n_s64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vdup_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[ADD_I:%.*]] = shl <1 x i64> [[VECINIT_I]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
 int64x1_t test_vdup_n_s64(int64_t a) { int64x1_t tmp = vdup_n_s64(a); return vadd_s64(tmp, tmp); }
-// CHECK-LABEL: @test_vdup_n_u64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vdup_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[ADD_I:%.*]] = shl <1 x i64> [[VECINIT_I]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
 int64x1_t test_vdup_n_u64(uint64_t a) { int64x1_t tmp = (int64x1_t)vdup_n_u64(a); return vadd_s64(tmp, tmp); }
-// CHECK-LABEL: @test_vdupq_n_s64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vdupq_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[TMP0:%.*]] = shl <2 x i64> [[VECINIT_I]], splat (i64 1)
+// CHECK-NEXT: [[ADD_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
 int64x2_t test_vdupq_n_s64(int64_t a) { int64x2_t tmp = vdupq_n_s64(a); return vaddq_s64(tmp, tmp); }
-// CHECK-LABEL: @test_vdupq_n_u64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vdupq_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[TMP0:%.*]] = shl <2 x i64> [[VECINIT_I]], splat (i64 1)
+// CHECK-NEXT: [[ADD_I:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
 uint64x2_t test_vdupq_n_u64(uint64_t a) { uint64x2_t tmp = vdupq_n_u64(a); return vaddq_u64(tmp, tmp); }
-// CHECK-LABEL: @test_veor_s8(
-// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[XOR_I]]
+// CHECK-LABEL: define <8 x i8> @test_veor_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[XOR_I]]
+//
 int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) { return veor_s8(a, b); }
-// CHECK-LABEL: @test_veor_s16(
-// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[XOR_I]]
+// CHECK-LABEL: define <4 x i16> @test_veor_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[XOR_I]]
+//
 int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) { return veor_s16(a, b); }
-// CHECK-LABEL: @test_veor_s32(
-// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[XOR_I]]
+// CHECK-LABEL: define <2 x i32> @test_veor_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[XOR_I]]
+//
 int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) { return veor_s32(a, b); }
-// CHECK-LABEL: @test_veor_s64(
-// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[XOR_I]]
+// CHECK-LABEL: define <1 x i64> @test_veor_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[XOR_I]]
+//
 int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) { return veor_s64(a, b); }
-// CHECK-LABEL: @test_veor_u8(
-// CHECK: [[XOR_I:%.*]] = xor <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[XOR_I]]
+// CHECK-LABEL: define <8 x i8> @test_veor_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[XOR_I]]
+//
 uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) { return veor_u8(a, b); }
-// CHECK-LABEL: @test_veor_u16(
-// CHECK: [[XOR_I:%.*]] = xor <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[XOR_I]]
+// CHECK-LABEL: define <4 x i16> @test_veor_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[XOR_I]]
+//
 uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) { return veor_u16(a, b); }
-// CHECK-LABEL: @test_veor_u32(
-// CHECK: [[XOR_I:%.*]] = xor <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[XOR_I]]
+// CHECK-LABEL: define <2 x i32> @test_veor_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[XOR_I]]
+//
 uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) { return veor_u32(a, b); }
-// CHECK-LABEL: @test_veor_u64(
-// CHECK: [[XOR_I:%.*]] = xor <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[XOR_I]]
+// CHECK-LABEL: define <1 x i64> @test_veor_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[XOR_I]]
+//
 uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) { return veor_u64(a, b); }
-// CHECK-LABEL: @test_veorq_s8(
-// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[XOR_I]]
+// CHECK-LABEL: define <16 x i8> @test_veorq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[XOR_I]]
+//
 int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) { return veorq_s8(a, b); }
-// CHECK-LABEL: @test_veorq_s16(
-// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[XOR_I]]
+// CHECK-LABEL: define <8 x i16> @test_veorq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[XOR_I]]
+//
 int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) { return veorq_s16(a, b); }
-// CHECK-LABEL: @test_veorq_s32(
-// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[XOR_I]]
+// CHECK-LABEL: define <4 x i32> @test_veorq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[XOR_I]]
+//
 int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) { return veorq_s32(a, b); }
-// CHECK-LABEL: @test_veorq_s64(
-// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[XOR_I]]
+// CHECK-LABEL: define <2 x i64> @test_veorq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[XOR_I]]
+//
 int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) { return veorq_s64(a, b); }
-// CHECK-LABEL: @test_veorq_u8(
-// CHECK: [[XOR_I:%.*]] = xor <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[XOR_I]]
+// CHECK-LABEL: define <16 x i8> @test_veorq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[XOR_I]]
+//
 uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) { return veorq_u8(a, b); }
-// CHECK-LABEL: @test_veorq_u16(
-// CHECK: [[XOR_I:%.*]] = xor <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[XOR_I]]
+// CHECK-LABEL: define <8 x i16> @test_veorq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[XOR_I]]
+//
 uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) { return veorq_u16(a, b); }
-// CHECK-LABEL: @test_veorq_u32(
-// CHECK: [[XOR_I:%.*]] = xor <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[XOR_I]]
+// CHECK-LABEL: define <4 x i32> @test_veorq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[XOR_I]]
+//
 uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) { return veorq_u32(a, b); }
-// CHECK-LABEL: @test_veorq_u64(
-// CHECK: [[XOR_I:%.*]] = xor <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[XOR_I]]
+// CHECK-LABEL: define <2 x i64> @test_veorq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[XOR_I:%.*]] = xor <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[XOR_I]]
+//
 uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) { return veorq_u64(a, b); }
-// CHECK-LABEL: @test_vext_s8(
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: ret <8 x i8> [[VEXT]]
+// CHECK-LABEL: define <8 x i8> @test_vext_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[VEXT]]
+//
 int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) { return vext_s8(a, b, 7); }
-// CHECK-LABEL: @test_vext_u8(
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: ret <8 x i8> [[VEXT]]
+// CHECK-LABEL: define <8 x i8> @test_vext_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[VEXT]]
+//
 uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) { return vext_u8(a, b, 7); }
-// CHECK-LABEL: @test_vext_p8(
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: ret <8 x i8> [[VEXT]]
+// CHECK-LABEL: define <8 x i8> @test_vext_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[VEXT]]
+//
 poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) { return vext_p8(a, b, 7); }
-// CHECK-LABEL: @test_vext_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x i16> [[VEXT]]
+// CHECK-LABEL: define <4 x i16> @test_vext_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[VEXT]]
+//
 int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) { return vext_s16(a, b, 3); }
-// CHECK-LABEL: @test_vext_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x i16> [[VEXT]]
+// CHECK-LABEL: define <4 x i16> @test_vext_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[VEXT]]
+//
 uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) { return vext_u16(a, b, 3); }
-// CHECK-LABEL: @test_vext_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x i16> [[VEXT]]
+// CHECK-LABEL: define <4 x i16> @test_vext_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[VEXT]]
+//
 poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) { return vext_p16(a, b, 3); }
-// CHECK-LABEL: @test_vext_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x i32> [[VEXT]]
+// CHECK-LABEL: define <2 x i32> @test_vext_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VEXT]]
+//
 int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) { return vext_s32(a, b, 1); }
-// CHECK-LABEL: @test_vext_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x i32> [[VEXT]]
+// CHECK-LABEL: define <2 x i32> @test_vext_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VEXT]]
+//
 uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) { return vext_u32(a, b, 1); }
-// CHECK-LABEL: @test_vext_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[VEXT]]
+// CHECK-LABEL: define <1 x i64> @test_vext_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
 int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) { return vext_s64(a, b, 0); }
-// CHECK-LABEL: @test_vext_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[VEXT]]
+// CHECK-LABEL: define <1 x i64> @test_vext_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
 uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) { return vext_u64(a, b, 0); }
-// CHECK-LABEL: @test_vext_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x float> [[VEXT]]
+// CHECK-LABEL: define <2 x float> @test_vext_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x float> [[VEXT]]
+//
 float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) { return vext_f32(a, b, 1); }
-// CHECK-LABEL: @test_vextq_s8(
-// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: ret <16 x i8> [[VEXT]]
+// CHECK-LABEL: define <16 x i8> @test_vextq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[VEXT]]
+//
 int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) { return vextq_s8(a, b, 15); }
-// CHECK-LABEL: @test_vextq_u8(
-// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: ret <16 x i8> [[VEXT]]
+// CHECK-LABEL: define <16 x i8> @test_vextq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[VEXT]]
+//
 uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) { return vextq_u8(a, b, 15); }
-// CHECK-LABEL: @test_vextq_p8(
-// CHECK: [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: ret <16 x i8> [[VEXT]]
+// CHECK-LABEL: define <16 x i8> @test_vextq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: ret <16 x i8> [[VEXT]]
+//
 poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) { return vextq_p8(a, b, 15); }
-// CHECK-LABEL: @test_vextq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
-// CHECK: ret <8 x i16> [[VEXT]]
+// CHECK-LABEL: define <8 x i16> @test_vextq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[VEXT]]
+//
 int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) { return vextq_s16(a, b, 7); }
-// CHECK-LABEL: @test_vextq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
-// CHECK: ret <8 x i16> [[VEXT]]
+// CHECK-LABEL: define <8 x i16> @test_vextq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[VEXT]]
+//
 uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) { return vextq_u16(a, b, 7); }
-// CHECK-LABEL: @test_vextq_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32>
-// CHECK: ret <8 x i16> [[VEXT]]
+// CHECK-LABEL: define <8 x i16> @test_vextq_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: ret <8 x i16> [[VEXT]]
+//
 poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) { return vextq_p16(a, b, 7); }
-// CHECK-LABEL: @test_vextq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x i32> [[VEXT]]
+// CHECK-LABEL: define <4 x i32> @test_vextq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VEXT]]
+//
 int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) { return vextq_s32(a, b, 3); }
-// CHECK-LABEL: @test_vextq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x i32> [[VEXT]]
+// CHECK-LABEL: define <4 x i32> @test_vextq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VEXT]]
+//
 uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) { return vextq_u32(a, b, 3); }
-// CHECK-LABEL: @test_vextq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x i64> [[VEXT]]
+// CHECK-LABEL: define <2 x i64> @test_vextq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[VEXT]]
+//
 int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) { return vextq_s64(a, b, 1); }
-// CHECK-LABEL: @test_vextq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32>
-// CHECK: ret <2 x i64> [[VEXT]]
+// CHECK-LABEL: define <2 x i64> @test_vextq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[VEXT]]
+//
 uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) { return vextq_u64(a, b, 1); }
-// CHECK-LABEL: @test_vextq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32>
-// CHECK: ret <4 x float> [[VEXT]]
+// CHECK-LABEL: define <4 x float> @test_vextq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32>
+// CHECK-NEXT: ret <4 x float> [[VEXT]]
+//
 float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) { return vextq_f32(a, b, 3); }
-// CHECK-LABEL: @test_vfma_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %c, <2 x float> %a)
-// CHECK: ret <2 x float> [[TMP3]]
+// CHECK-LABEL: define <2 x float> @test_vfma_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[C]], <2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vfma_f32(a, b, c); }
-// CHECK-LABEL: @test_vfmaq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %c, <4 x float> %a)
-// CHECK: ret <4 x float> [[TMP3]]
+// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[C]], <4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return vfmaq_f32(a, b, c); }
-// CHECK-LABEL: @test_vfms_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <2 x float> %b
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
-// CHECK: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[SUB_I]], <2 x float> %c, <2 x float> %a)
-// CHECK: ret <2 x float> [[TMP3]]
+// CHECK-LABEL: define <2 x float> @test_vfms_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[B]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[C]], <2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vfms_f32(a, b, c); }
-// CHECK-LABEL: @test_vfmsq_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <4 x float> %b
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
-// CHECK: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[SUB_I]], <4 x float> %c, <4 x float> %a)
-// CHECK: ret <4 x float> [[TMP3]]
+// CHECK-LABEL: define <4 x float> @test_vfmsq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[B]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[C]], <4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return vfmsq_f32(a, b, c); }
-// CHECK-LABEL: @test_vget_high_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vget_high_s8(int8x16_t a) { return vget_high_s8(a); }
-// CHECK-LABEL: @test_vget_high_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vget_high_s16(int16x8_t a) { return vget_high_s16(a); }
-// CHECK-LABEL: @test_vget_high_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 int32x2_t test_vget_high_s32(int32x4_t a) { return vget_high_s32(a); }
-// CHECK-LABEL: @test_vget_high_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32>
-// CHECK: ret <1 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> poison, <1 x i32>
+// CHECK-NEXT: ret <1 x i64> [[SHUFFLE_I]]
+//
 int64x1_t test_vget_high_s64(int64x2_t a) { return vget_high_s64(a); }
-// CHECK-LABEL: @test_vget_high_f16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32>
-// CHECK: ret <4 x half> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x half> @test_vget_high_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32>
+// CHECK-NEXT: ret <4 x half> [[SHUFFLE_I]]
+//
 float16x4_t test_vget_high_f16(float16x8_t a) { return vget_high_f16(a); }
-// CHECK-LABEL: @test_vget_high_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x float> @test_vget_high_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
 float32x2_t test_vget_high_f32(float32x4_t a) { return vget_high_f32(a); }
-// CHECK-LABEL: @test_vget_high_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vget_high_u8(uint8x16_t a) { return vget_high_u8(a); }
-// CHECK-LABEL: @test_vget_high_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 uint16x4_t test_vget_high_u16(uint16x8_t a) { return vget_high_u16(a); }
-// CHECK-LABEL: @test_vget_high_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 uint32x2_t test_vget_high_u32(uint32x4_t a) { return vget_high_u32(a); }
-// CHECK-LABEL: @test_vget_high_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32>
-// CHECK: ret <1 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> poison, <1 x i32>
+// CHECK-NEXT: ret <1 x i64> [[SHUFFLE_I]]
+//
 uint64x1_t test_vget_high_u64(uint64x2_t a) { return vget_high_u64(a); }
-// CHECK-LABEL: @test_vget_high_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vget_high_p8(poly8x16_t a) { return vget_high_p8(a); }
-// CHECK-LABEL: @test_vget_high_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vget_high_p16(poly16x8_t a) { return vget_high_p16(a); }
-// CHECK-LABEL: @test_vget_lane_u8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define zeroext i8 @test_vget_lane_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i64 7
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
 uint8_t test_vget_lane_u8(uint8x8_t a) { return vget_lane_u8(a, 7); }
-// CHECK-LABEL: @test_vget_lane_u16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %a, i32 3
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define zeroext i16 @test_vget_lane_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[A]], i64 3
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
 uint16_t test_vget_lane_u16(uint16x4_t a) { return vget_lane_u16(a, 3); }
-// CHECK-LABEL: @test_vget_lane_u32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %a, i32 1
-// CHECK: ret i32 [[VGET_LANE]]
+// CHECK-LABEL: define i32 @test_vget_lane_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 1
+// CHECK-NEXT: ret i32 [[VGET_LANE]]
+//
 uint32_t test_vget_lane_u32(uint32x2_t a) { return vget_lane_u32(a, 1); }
-// CHECK-LABEL: @test_vget_lane_s8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define signext i8 @test_vget_lane_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i64 7
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
 int8_t test_vget_lane_s8(int8x8_t a) { return vget_lane_s8(a, 7); }
-// CHECK-LABEL: @test_vget_lane_s16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %a, i32 3
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define signext i16 @test_vget_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[A]], i64 3
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
 int16_t test_vget_lane_s16(int16x4_t a) { return vget_lane_s16(a, 3); }
-// CHECK-LABEL: @test_vget_lane_s32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i32> %a, i32 1
-// CHECK: ret i32 [[VGET_LANE]]
+// CHECK-LABEL: define i32 @test_vget_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[A]], i64 1
+// CHECK-NEXT: ret i32 [[VGET_LANE]]
+//
 int32_t test_vget_lane_s32(int32x2_t a) { return vget_lane_s32(a, 1); }
-// CHECK-LABEL: @test_vget_lane_p8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define signext i8 @test_vget_lane_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i64 7
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
 poly8_t test_vget_lane_p8(poly8x8_t a) { return vget_lane_p8(a, 7); }
-// CHECK-LABEL: @test_vget_lane_p16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> %a, i32 3
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define signext i16 @test_vget_lane_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[A]], i64 3
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
 poly16_t test_vget_lane_p16(poly16x4_t a) { return vget_lane_p16(a, 3); }
-// CHECK-LABEL: @test_vget_lane_f32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x float> %a, i32 1
-// CHECK: ret float [[VGET_LANE]]
+// CHECK-LABEL: define float @test_vget_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x float> [[A]], i64 1
+// CHECK-NEXT: ret float [[VGET_LANE]]
+//
 float32_t test_vget_lane_f32(float32x2_t a) { return vget_lane_f32(a, 1); }
-// CHECK-LABEL: @test_vget_lane_f16(
-// CHECK: [[__REINT_242:%.*]] = alloca <4 x half>, align 8
-// CHECK: [[__REINT1_242:%.*]] = alloca i16, align 2
-// CHECK: store <4 x half> %a, ptr [[__REINT_242]], align 8
-// CHECK: [[TMP1:%.*]] = load <4 x i16>, ptr [[__REINT_242]], align 8
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 1
-// CHECK: store i16 [[VGET_LANE]], ptr [[__REINT1_242]], align 2
-// CHECK: [[TMP5:%.*]] = load half, ptr [[__REINT1_242]], align 2
-// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float
-// CHECK: ret float [[CONV]]
+// CHECK-LABEL: define float @test_vget_lane_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[A]], i64 1
+// CHECK-NEXT: [[CONV:%.*]] = fpext half [[TMP0]] to float
+// CHECK-NEXT: ret float [[CONV]]
+//
 float32_t test_vget_lane_f16(float16x4_t a) { return vget_lane_f16(a, 1); }
-// CHECK-LABEL: @test_vgetq_lane_u8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define zeroext i8 @test_vgetq_lane_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i64 15
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
 uint8_t test_vgetq_lane_u8(uint8x16_t a) { return vgetq_lane_u8(a, 15); }
-// CHECK-LABEL: @test_vgetq_lane_u16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> %a, i32 7
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define zeroext i16 @test_vgetq_lane_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[A]], i64 7
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
 uint16_t test_vgetq_lane_u16(uint16x8_t a) { return vgetq_lane_u16(a, 7); }
-// CHECK-LABEL: @test_vgetq_lane_u32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i32> %a, i32 3
-// CHECK: ret i32 [[VGET_LANE]]
+// CHECK-LABEL: define i32 @test_vgetq_lane_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 3
+// CHECK-NEXT: ret i32 [[VGET_LANE]]
+//
 uint32_t test_vgetq_lane_u32(uint32x4_t a) { return vgetq_lane_u32(a, 3); }
-// CHECK-LABEL: @test_vgetq_lane_s8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define signext i8 @test_vgetq_lane_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i64 15
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
 int8_t test_vgetq_lane_s8(int8x16_t a) { return vgetq_lane_s8(a, 15); }
-// CHECK-LABEL: @test_vgetq_lane_s16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> %a, i32 7
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define signext i16 @test_vgetq_lane_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[A]], i64 7
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
 int16_t test_vgetq_lane_s16(int16x8_t a) { return vgetq_lane_s16(a, 7); }
-// CHECK-LABEL: @test_vgetq_lane_s32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x i32> %a, i32 3
-// CHECK: ret i32 [[VGET_LANE]]
+// CHECK-LABEL: define i32 @test_vgetq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i32> [[A]], i64 3
+// CHECK-NEXT: ret i32 [[VGET_LANE]]
+//
 int32_t test_vgetq_lane_s32(int32x4_t a) { return vgetq_lane_s32(a, 3); }
-// CHECK-LABEL: @test_vgetq_lane_p8(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
-// CHECK: ret i8 [[VGET_LANE]]
+// CHECK-LABEL: define signext i8 @test_vgetq_lane_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i64 15
+// CHECK-NEXT: ret i8 [[VGET_LANE]]
+//
 poly8_t test_vgetq_lane_p8(poly8x16_t a) { return vgetq_lane_p8(a, 15); }
-// CHECK-LABEL: @test_vgetq_lane_p16(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> %a, i32 7
-// CHECK: ret i16 [[VGET_LANE]]
+// CHECK-LABEL: define signext i16 @test_vgetq_lane_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[A]], i64 7
+// CHECK-NEXT: ret i16 [[VGET_LANE]]
+//
 poly16_t test_vgetq_lane_p16(poly16x8_t a) { return vgetq_lane_p16(a, 7); }
-// CHECK-LABEL: @test_vgetq_lane_f32(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <4 x float> %a, i32 3
-// CHECK: ret float [[VGET_LANE]]
+// CHECK-LABEL: define float @test_vgetq_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x float> [[A]], i64 3
+// CHECK-NEXT: ret float [[VGET_LANE]]
+//
 float32_t test_vgetq_lane_f32(float32x4_t a) { return vgetq_lane_f32(a, 3); }
-// CHECK-LABEL: @test_vgetq_lane_f16(
-// CHECK: [[__REINT_244:%.*]] = alloca <8 x half>, align 16
-// CHECK: [[__REINT1_244:%.*]] = alloca i16, align 2
-// CHECK: store <8 x half> %a, ptr [[__REINT_244]], align 16
-// CHECK: [[TMP1:%.*]] = load <8 x i16>, ptr [[__REINT_244]], align 16
-// CHECK: [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3
-// CHECK: store i16 [[VGET_LANE]], ptr [[__REINT1_244]], align 2
-// CHECK: [[TMP5:%.*]] = load half, ptr [[__REINT1_244]], align 2
-// CHECK: [[CONV:%.*]] = fpext half [[TMP5]] to float
-// CHECK: ret float [[CONV]]
+// CHECK-LABEL: define float @test_vgetq_lane_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x half> [[A]], i64 3
+// CHECK-NEXT: [[CONV:%.*]] = fpext half [[TMP0]] to float
+// CHECK-NEXT: ret float [[CONV]]
+//
 float32_t test_vgetq_lane_f16(float16x8_t a) { return vgetq_lane_f16(a, 3); }
-// CHECK-LABEL: @test_vget_lane_s64(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %a, i32 0
-// CHECK: ret i64 [[VGET_LANE]]
+// CHECK-LABEL: define i64 @test_vget_lane_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[A]], i64 0
+// CHECK-NEXT: ret i64 [[VGET_LANE]]
+//
 int64_t test_vget_lane_s64(int64x1_t a) { return vget_lane_s64(a, 0); }
-// CHECK-LABEL: @test_vget_lane_u64(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <1 x i64> %a, i32 0
-// CHECK: ret i64 [[VGET_LANE]]
+// CHECK-LABEL: define i64 @test_vget_lane_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[A]], i64 0
+// CHECK-NEXT: ret i64 [[VGET_LANE]]
+//
 uint64_t test_vget_lane_u64(uint64x1_t a) { return vget_lane_u64(a, 0); }
-// CHECK-LABEL: @test_vgetq_lane_s64(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i64> %a, i32 1
-// CHECK: ret i64 [[VGET_LANE]]
+// CHECK-LABEL: define i64 @test_vgetq_lane_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 1
+// CHECK-NEXT: ret i64 [[VGET_LANE]]
+//
 int64_t test_vgetq_lane_s64(int64x2_t a) { return vgetq_lane_s64(a, 1); }
-// CHECK-LABEL: @test_vgetq_lane_u64(
-// CHECK: [[VGET_LANE:%.*]] = extractelement <2 x i64> %a, i32 1
-// CHECK: ret i64 [[VGET_LANE]]
+// CHECK-LABEL: define i64 @test_vgetq_lane_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i64> [[A]], i64 1
+// CHECK-NEXT: ret i64 [[VGET_LANE]]
+//
 uint64_t test_vgetq_lane_u64(uint64x2_t a) { return vgetq_lane_u64(a, 1); }
-// CHECK-LABEL: @test_vget_low_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vget_low_s8(int8x16_t a) { return vget_low_s8(a); }
-// CHECK-LABEL: @test_vget_low_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vget_low_s16(int16x8_t a) { return vget_low_s16(a); }
-// CHECK-LABEL: @test_vget_low_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 int32x2_t test_vget_low_s32(int32x4_t a) { return vget_low_s32(a); }
-// CHECK-LABEL: @test_vget_low_s64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> poison, <1 x i32> zeroinitializer
+// CHECK-NEXT: ret <1 x i64> [[SHUFFLE_I]]
+//
 int64x1_t test_vget_low_s64(int64x2_t a) { return vget_low_s64(a); }
-// CHECK-LABEL: @test_vget_low_f16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32>
-// CHECK: ret <4 x half> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x half> @test_vget_low_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> poison, <4 x i32>
+// CHECK-NEXT: ret <4 x half> [[SHUFFLE_I]]
+//
 float16x4_t test_vget_low_f16(float16x8_t a) { return vget_low_f16(a); }
-// CHECK-LABEL: @test_vget_low_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x float> @test_vget_low_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <2 x i32>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
 float32x2_t test_vget_low_f32(float32x4_t a) { return vget_low_f32(a); }
-// CHECK-LABEL: @test_vget_low_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vget_low_u8(uint8x16_t a) { return vget_low_u8(a); }
-// CHECK-LABEL: @test_vget_low_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 uint16x4_t test_vget_low_u16(uint16x8_t a) { return vget_low_u16(a); }
-// CHECK-LABEL: @test_vget_low_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 uint32x2_t test_vget_low_u32(uint32x4_t a) { return vget_low_u32(a); }
-// CHECK-LABEL: @test_vget_low_u64(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
-// CHECK: ret <1 x i64> [[SHUFFLE_I]]
+// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> poison, <1 x i32> zeroinitializer
+// CHECK-NEXT: ret <1 x i64> [[SHUFFLE_I]]
+//
 uint64x1_t test_vget_low_u64(uint64x2_t a) { return vget_low_u64(a); }
-// CHECK-LABEL: @test_vget_low_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <8 x i32>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vget_low_p8(poly8x16_t a) { return vget_low_p8(a); }
-// CHECK-LABEL: @test_vget_low_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <4 x i32>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vget_low_p16(poly16x8_t a) { return vget_low_p16(a); }
-// CHECK-LABEL: @test_vhadd_s8(
-// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VHADD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VHADD_V_I]]
+//
 int8x8_t test_vhadd_s8(int8x8_t a, int8x8_t b) { return vhadd_s8(a, b); }
-// CHECK-LABEL: @test_vhadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VHADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VHADD_V2_I]]
+//
 int16x4_t test_vhadd_s16(int16x4_t a, int16x4_t b) { return vhadd_s16(a, b); }
-// CHECK-LABEL: @test_vhadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VHADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> [[A]], <2 x
i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VHADD_V2_I]] +// int32x2_t test_vhadd_s32(int32x2_t a, int32x2_t b) { return vhadd_s32(a, b); } -// CHECK-LABEL: @test_vhadd_u8( -// CHECK: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VHADD_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vhadd_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VHADD_V_I]] +// uint8x8_t test_vhadd_u8(uint8x8_t a, uint8x8_t b) { return vhadd_u8(a, b); } -// CHECK-LABEL: @test_vhadd_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VHADD_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vhadd_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VHADD_V2_I]] +// uint16x4_t test_vhadd_u16(uint16x4_t a, uint16x4_t b) { return vhadd_u16(a, b); } -// CHECK-LABEL: @test_vhadd_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VHADD_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vhadd_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VHADD_V2_I]] +// uint32x2_t test_vhadd_u32(uint32x2_t a, uint32x2_t b) { return vhadd_u32(a, b); } -// CHECK-LABEL: @test_vhaddq_s8( -// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VHADDQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vhaddq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VHADDQ_V_I]] +// int8x16_t test_vhaddq_s8(int8x16_t a, int8x16_t b) { return vhaddq_s8(a, b); } -// CHECK-LABEL: @test_vhaddq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VHADDQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vhaddq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x 
i16> [[VHADDQ_V2_I]] +// int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) { return vhaddq_s16(a, b); } -// CHECK-LABEL: @test_vhaddq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VHADDQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VHADDQ_V2_I]] +// int32x4_t test_vhaddq_s32(int32x4_t a, int32x4_t b) { return vhaddq_s32(a, b); } -// CHECK-LABEL: @test_vhaddq_u8( -// CHECK: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VHADDQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vhaddq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VHADDQ_V_I]] +// uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b) { return vhaddq_u8(a, b); } -// CHECK-LABEL: @test_vhaddq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VHADDQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VHADDQ_V2_I]] +// uint16x8_t test_vhaddq_u16(uint16x8_t a, uint16x8_t b) { return vhaddq_u16(a, b); } -// CHECK-LABEL: @test_vhaddq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VHADDQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VHADDQ_V2_I]] +// uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b) { return vhaddq_u32(a, b); } -// CHECK-LABEL: @test_vhsub_s8( -// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VHSUB_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vhsub_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x 
i8> [[VHSUB_V_I]] +// int8x8_t test_vhsub_s8(int8x8_t a, int8x8_t b) { return vhsub_s8(a, b); } -// CHECK-LABEL: @test_vhsub_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VHSUB_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vhsub_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VHSUB_V2_I]] +// int16x4_t test_vhsub_s16(int16x4_t a, int16x4_t b) { return vhsub_s16(a, b); } -// CHECK-LABEL: @test_vhsub_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VHSUB_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vhsub_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VHSUB_V2_I]] +// int32x2_t test_vhsub_s32(int32x2_t a, int32x2_t b) { return vhsub_s32(a, b); } -// CHECK-LABEL: @test_vhsub_u8( -// CHECK: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VHSUB_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vhsub_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VHSUB_V_I]] +// uint8x8_t test_vhsub_u8(uint8x8_t a, uint8x8_t b) { return vhsub_u8(a, b); } -// CHECK-LABEL: @test_vhsub_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VHSUB_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vhsub_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VHSUB_V2_I]] +// uint16x4_t test_vhsub_u16(uint16x4_t a, uint16x4_t b) { return vhsub_u16(a, b); } -// CHECK-LABEL: @test_vhsub_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VHSUB_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vhsub_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VHSUB_V2_I]] +// uint32x2_t test_vhsub_u32(uint32x2_t a, uint32x2_t b) { return vhsub_u32(a, b); } -// CHECK-LABEL: @test_vhsubq_s8( -// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VHSUBQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vhsubq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VHSUBQ_V_I]] +// int8x16_t test_vhsubq_s8(int8x16_t a, int8x16_t b) { return vhsubq_s8(a, b); } -// CHECK-LABEL: @test_vhsubq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VHSUBQ_V2_I]] +// int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) { return vhsubq_s16(a, b); } -// CHECK-LABEL: @test_vhsubq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VHSUBQ_V2_I]] +// int32x4_t test_vhsubq_s32(int32x4_t a, int32x4_t b) { return vhsubq_s32(a, b); } -// CHECK-LABEL: @test_vhsubq_u8( -// CHECK: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VHSUBQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vhsubq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VHSUBQ_V_I]] +// uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b) { return vhsubq_u8(a, b); } -// CHECK-LABEL: @test_vhsubq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VHSUBQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VHSUBQ_V2_I]] +// uint16x8_t test_vhsubq_u16(uint16x8_t a, uint16x8_t b) { return vhsubq_u16(a, b); } -// CHECK-LABEL: @test_vhsubq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VHSUBQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vhsubq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VHSUBQ_V2_I]] +// uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b) { return vhsubq_u32(a, b); } -// CHECK-LABEL: @test_vld1q_u8( -// CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr %a, i32 1) -// CHECK: ret <16 x i8> [[VLD1]] +// CHECK-LABEL: define <16 x i8> @test_vld1q_u8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <16 x i8>, ptr [[A]], align 1 +// CHECK-NEXT: ret <16 x i8> [[VLD1]] +// uint8x16_t test_vld1q_u8(uint8_t const * a) { return vld1q_u8(a); } -// CHECK-LABEL: @test_vld1q_u16( -// CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr %a, i32 2) -// CHECK: ret <8 x i16> [[VLD1]] +// CHECK-LABEL: define <8 x i16> @test_vld1q_u16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <8 x i16>, ptr [[A]], align 2 +// CHECK-NEXT: ret <8 x i16> [[VLD1]] +// uint16x8_t test_vld1q_u16(uint16_t const * a) { return vld1q_u16(a); } -// CHECK-LABEL: @test_vld1q_u32( -// CHECK: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %a, i32 4) -// CHECK: ret <4 x i32> [[VLD1]] +// CHECK-LABEL: define <4 x i32> @test_vld1q_u32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <4 x i32>, ptr [[A]], align 4 +// CHECK-NEXT: ret <4 x i32> [[VLD1]] +// uint32x4_t test_vld1q_u32(uint32_t const * a) { return vld1q_u32(a); } -// CHECK-LABEL: @test_vld1q_u64( -// CHECK: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0(ptr %a, i32 4) -// CHECK: ret <2 x i64> [[VLD1]] +// CHECK-LABEL: define <2 x i64> @test_vld1q_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <2 x i64>, ptr [[A]], align 4 +// CHECK-NEXT: ret <2 x i64> [[VLD1]] +// uint64x2_t test_vld1q_u64(uint64_t const * a) { return vld1q_u64(a); } -// CHECK-LABEL: @test_vld1q_s8( -// CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr %a, i32 1) -// CHECK: ret <16 x i8> [[VLD1]] +// CHECK-LABEL: define <16 x i8> @test_vld1q_s8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <16 x i8>, ptr [[A]], align 1 +// CHECK-NEXT: ret <16 x i8> [[VLD1]] +// int8x16_t test_vld1q_s8(int8_t const * a) { return vld1q_s8(a); } -// CHECK-LABEL: @test_vld1q_s16( -// CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr %a, i32 2) -// CHECK: ret <8 x i16> [[VLD1]] +// CHECK-LABEL: 
define <8 x i16> @test_vld1q_s16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <8 x i16>, ptr [[A]], align 2 +// CHECK-NEXT: ret <8 x i16> [[VLD1]] +// int16x8_t test_vld1q_s16(int16_t const * a) { return vld1q_s16(a); } -// CHECK-LABEL: @test_vld1q_s32( -// CHECK: [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0(ptr %a, i32 4) -// CHECK: ret <4 x i32> [[VLD1]] +// CHECK-LABEL: define <4 x i32> @test_vld1q_s32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <4 x i32>, ptr [[A]], align 4 +// CHECK-NEXT: ret <4 x i32> [[VLD1]] +// int32x4_t test_vld1q_s32(int32_t const * a) { return vld1q_s32(a); } -// CHECK-LABEL: @test_vld1q_s64( -// CHECK: [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0(ptr %a, i32 4) -// CHECK: ret <2 x i64> [[VLD1]] +// CHECK-LABEL: define <2 x i64> @test_vld1q_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <2 x i64>, ptr [[A]], align 4 +// CHECK-NEXT: ret <2 x i64> [[VLD1]] +// int64x2_t test_vld1q_s64(int64_t const * a) { return vld1q_s64(a); } -// CHECK-LABEL: @test_vld1q_f16( -// CHECK: [[VLD1:%.*]] = call <8 x half> @llvm.arm.neon.vld1.v8f16.p0(ptr %a, i32 2) -// CHECK: ret <8 x half> [[VLD1]] +// CHECK-LABEL: define <8 x half> @test_vld1q_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <8 x half>, ptr [[A]], align 2 +// CHECK-NEXT: ret <8 x half> [[VLD1]] +// float16x8_t test_vld1q_f16(float16_t const * a) { return vld1q_f16(a); } -// CHECK-LABEL: @test_vld1q_f32( -// CHECK: [[VLD1:%.*]] = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0(ptr %a, i32 4) -// CHECK: ret <4 x float> [[VLD1]] +// CHECK-LABEL: define <4 x float> @test_vld1q_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <4 x float>, ptr [[A]], align 4 +// CHECK-NEXT: ret <4 x float> [[VLD1]] +// float32x4_t test_vld1q_f32(float32_t const * a) { return vld1q_f32(a); } -// CHECK-LABEL: @test_vld1q_p8( -// CHECK: [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0(ptr %a, i32 1) -// CHECK: ret <16 x i8> [[VLD1]] +// CHECK-LABEL: define <16 x i8> @test_vld1q_p8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <16 x i8>, ptr [[A]], align 1 +// CHECK-NEXT: ret <16 x i8> [[VLD1]] +// poly8x16_t test_vld1q_p8(poly8_t const * a) { return vld1q_p8(a); } -// CHECK-LABEL: @test_vld1q_p16( -// CHECK: [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0(ptr %a, i32 2) -// CHECK: ret <8 x i16> [[VLD1]] +// CHECK-LABEL: define <8 x i16> @test_vld1q_p16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <8 x i16>, ptr [[A]], align 2 +// CHECK-NEXT: ret <8 x i16> [[VLD1]] +// poly16x8_t test_vld1q_p16(poly16_t const * a) { return vld1q_p16(a); } -// CHECK-LABEL: @test_vld1_u8( -// CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %a, i32 1) -// CHECK: ret <8 x i8> [[VLD1]] +// CHECK-LABEL: define <8 x i8> @test_vld1_u8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <8 x i8>, ptr [[A]], align 1 +// CHECK-NEXT: ret <8 x i8> [[VLD1]] +// uint8x8_t test_vld1_u8(uint8_t const * a) { return vld1_u8(a); } -// 
CHECK-LABEL: @test_vld1_u16( -// CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr %a, i32 2) -// CHECK: ret <4 x i16> [[VLD1]] +// CHECK-LABEL: define <4 x i16> @test_vld1_u16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <4 x i16>, ptr [[A]], align 2 +// CHECK-NEXT: ret <4 x i16> [[VLD1]] +// uint16x4_t test_vld1_u16(uint16_t const * a) { return vld1_u16(a); } -// CHECK-LABEL: @test_vld1_u32( -// CHECK: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0(ptr %a, i32 4) -// CHECK: ret <2 x i32> [[VLD1]] +// CHECK-LABEL: define <2 x i32> @test_vld1_u32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <2 x i32>, ptr [[A]], align 4 +// CHECK-NEXT: ret <2 x i32> [[VLD1]] +// uint32x2_t test_vld1_u32(uint32_t const * a) { return vld1_u32(a); } -// CHECK-LABEL: @test_vld1_u64( -// CHECK: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %a, i32 4) -// CHECK: ret <1 x i64> [[VLD1]] +// CHECK-LABEL: define <1 x i64> @test_vld1_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <1 x i64>, ptr [[A]], align 4 +// CHECK-NEXT: ret <1 x i64> [[VLD1]] +// uint64x1_t test_vld1_u64(uint64_t const * a) { return vld1_u64(a); } -// CHECK-LABEL: @test_vld1_s8( -// CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %a, i32 1) -// CHECK: ret <8 x i8> [[VLD1]] +// CHECK-LABEL: define <8 x i8> @test_vld1_s8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <8 x i8>, ptr [[A]], align 1 +// CHECK-NEXT: ret <8 x i8> [[VLD1]] +// int8x8_t test_vld1_s8(int8_t const * a) { return vld1_s8(a); } -// CHECK-LABEL: @test_vld1_s16( -// CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr %a, i32 2) -// CHECK: ret <4 x i16> [[VLD1]] +// CHECK-LABEL: define <4 x i16> @test_vld1_s16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <4 x i16>, ptr [[A]], align 2 +// CHECK-NEXT: ret <4 x i16> [[VLD1]] +// int16x4_t test_vld1_s16(int16_t const * a) { return vld1_s16(a); } -// CHECK-LABEL: @test_vld1_s32( -// CHECK: [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0(ptr %a, i32 4) -// CHECK: ret <2 x i32> [[VLD1]] +// CHECK-LABEL: define <2 x i32> @test_vld1_s32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <2 x i32>, ptr [[A]], align 4 +// CHECK-NEXT: ret <2 x i32> [[VLD1]] +// int32x2_t test_vld1_s32(int32_t const * a) { return vld1_s32(a); } -// CHECK-LABEL: @test_vld1_s64( -// CHECK: [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %a, i32 4) -// CHECK: ret <1 x i64> [[VLD1]] +// CHECK-LABEL: define <1 x i64> @test_vld1_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <1 x i64>, ptr [[A]], align 4 +// CHECK-NEXT: ret <1 x i64> [[VLD1]] +// int64x1_t test_vld1_s64(int64_t const * a) { return vld1_s64(a); } -// CHECK-LABEL: @test_vld1_f16( -// CHECK: [[VLD1:%.*]] = call <4 x half> @llvm.arm.neon.vld1.v4f16.p0(ptr %a, i32 2) -// CHECK: ret <4 x half> [[VLD1]] +// CHECK-LABEL: define <4 x half> @test_vld1_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <4 x half>, ptr 
[[A]], align 2 +// CHECK-NEXT: ret <4 x half> [[VLD1]] +// float16x4_t test_vld1_f16(float16_t const * a) { return vld1_f16(a); } -// CHECK-LABEL: @test_vld1_f32( -// CHECK: [[VLD1:%.*]] = call <2 x float> @llvm.arm.neon.vld1.v2f32.p0(ptr %a, i32 4) -// CHECK: ret <2 x float> [[VLD1]] +// CHECK-LABEL: define <2 x float> @test_vld1_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <2 x float>, ptr [[A]], align 4 +// CHECK-NEXT: ret <2 x float> [[VLD1]] +// float32x2_t test_vld1_f32(float32_t const * a) { return vld1_f32(a); } -// CHECK-LABEL: @test_vld1_p8( -// CHECK: [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %a, i32 1) -// CHECK: ret <8 x i8> [[VLD1]] +// CHECK-LABEL: define <8 x i8> @test_vld1_p8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <8 x i8>, ptr [[A]], align 1 +// CHECK-NEXT: ret <8 x i8> [[VLD1]] +// poly8x8_t test_vld1_p8(poly8_t const * a) { return vld1_p8(a); } -// CHECK-LABEL: @test_vld1_p16( -// CHECK: [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0(ptr %a, i32 2) -// CHECK: ret <4 x i16> [[VLD1]] +// CHECK-LABEL: define <4 x i16> @test_vld1_p16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD1:%.*]] = load <4 x i16>, ptr [[A]], align 2 +// CHECK-NEXT: ret <4 x i16> [[VLD1]] +// poly16x4_t test_vld1_p16(poly16_t const * a) { return vld1_p16(a); } -// CHECK-LABEL: @test_vld1q_dup_u8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer -// CHECK: ret <16 x i8> [[LANE]] +// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// uint8x16_t test_vld1q_dup_u8(uint8_t const * a) { return vld1q_dup_u8(a); } -// CHECK-LABEL: @test_vld1q_dup_u16( -// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2 -// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i16> [[LANE]] +// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> [[LANE]] +// uint16x8_t test_vld1q_dup_u16(uint16_t const * a) { return vld1q_dup_u16(a); } -// CHECK-LABEL: @test_vld1q_dup_u32( -// CHECK: [[TMP2:%.*]] = load i32, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i32> [[LANE]] +// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32( +// CHECK-SAME: ptr noundef [[A:%.*]]) 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i32> [[LANE]] +// uint32x4_t test_vld1q_dup_u32(uint32_t const * a) { return vld1q_dup_u32(a); } -// CHECK-LABEL: @test_vld1q_dup_u64( -// CHECK: [[TMP2:%.*]] = load i64, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[LANE]] +// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> [[LANE]] +// uint64x2_t test_vld1q_dup_u64(uint64_t const * a) { return vld1q_dup_u64(a); } -// CHECK-LABEL: @test_vld1q_dup_s8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer -// CHECK: ret <16 x i8> [[LANE]] +// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// int8x16_t test_vld1q_dup_s8(int8_t const * a) { return vld1q_dup_s8(a); } -// CHECK-LABEL: @test_vld1q_dup_s16( -// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2 -// CHECK: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i16> [[LANE]] +// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> [[LANE]] +// int16x8_t test_vld1q_dup_s16(int16_t const * a) { return vld1q_dup_s16(a); } -// CHECK-LABEL: @test_vld1q_dup_s32( -// CHECK: [[TMP2:%.*]] = load i32, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i32> [[LANE]] +// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> 
zeroinitializer +// CHECK-NEXT: ret <4 x i32> [[LANE]] +// int32x4_t test_vld1q_dup_s32(int32_t const * a) { return vld1q_dup_s32(a); } -// CHECK-LABEL: @test_vld1q_dup_s64( -// CHECK: [[TMP2:%.*]] = load i64, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[LANE]] +// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i64> [[LANE]] +// int64x2_t test_vld1q_dup_s64(int64_t const * a) { return vld1q_dup_s64(a); } -// CHECK-LABEL: @test_vld1q_dup_f16( -// CHECK: [[TMP2:%.*]] = load half, ptr %a, align 2 -// CHECK: [[TMP3:%.*]] = insertelement <8 x half> poison, half [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer -// CHECK: ret <8 x half> [[LANE]] +// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x half> [[LANE]] +// float16x8_t test_vld1q_dup_f16(float16_t const * a) { return vld1q_dup_f16(a); } -// CHECK-LABEL: @test_vld1q_dup_f32( -// CHECK: [[TMP2:%.*]] = load float, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x float> [[LANE]] +// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x float> [[LANE]] +// float32x4_t test_vld1q_dup_f32(float32_t const * a) { return vld1q_dup_f32(a); } -// CHECK-LABEL: @test_vld1q_dup_p8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer -// CHECK: ret <16 x i8> [[LANE]] +// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> zeroinitializer +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// poly8x16_t test_vld1q_dup_p8(poly8_t const * a) { return vld1q_dup_p8(a); } -// CHECK-LABEL: @test_vld1q_dup_p16( -// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2 -// CHECK: 
[[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i16> [[LANE]] +// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i16> [[LANE]] +// poly16x8_t test_vld1q_dup_p16(poly16_t const * a) { return vld1q_dup_p16(a); } -// CHECK-LABEL: @test_vld1_dup_u8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i8> [[LANE]] +// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// uint8x8_t test_vld1_dup_u8(uint8_t const * a) { return vld1_dup_u8(a); } -// CHECK-LABEL: @test_vld1_dup_u16( -// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2 -// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i16> [[LANE]] +// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> [[LANE]] +// uint16x4_t test_vld1_dup_u16(uint16_t const * a) { return vld1_dup_u16(a); } -// CHECK-LABEL: @test_vld1_dup_u32( -// CHECK: [[TMP2:%.*]] = load i32, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i32> [[LANE]] +// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i32> [[LANE]] +// uint32x2_t test_vld1_dup_u32(uint32_t const * a) { return vld1_dup_u32(a); } -// CHECK-LABEL: @test_vld1_dup_u64( -// CHECK: [[TMP2:%.*]] = load i64, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[LANE]] +// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] 
{ +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0 +// CHECK-NEXT: ret <1 x i64> [[TMP1]] +// uint64x1_t test_vld1_dup_u64(uint64_t const * a) { return vld1_dup_u64(a); } -// CHECK-LABEL: @test_vld1_dup_s8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i8> [[LANE]] +// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// int8x8_t test_vld1_dup_s8(int8_t const * a) { return vld1_dup_s8(a); } -// CHECK-LABEL: @test_vld1_dup_s16( -// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2 -// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i16> [[LANE]] +// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> [[LANE]] +// int16x4_t test_vld1_dup_s16(int16_t const * a) { return vld1_dup_s16(a); } -// CHECK-LABEL: @test_vld1_dup_s32( -// CHECK: [[TMP2:%.*]] = load i32, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x i32> [[LANE]] +// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x i32> [[LANE]] +// int32x2_t test_vld1_dup_s32(int32_t const * a) { return vld1_dup_s32(a); } -// CHECK-LABEL: @test_vld1_dup_s64( -// CHECK: [[TMP2:%.*]] = load i64, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[LANE]] +// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0 +// CHECK-NEXT: ret <1 x i64> [[TMP1]] +// int64x1_t test_vld1_dup_s64(int64_t const * a) { return vld1_dup_s64(a); } -// CHECK-LABEL: @test_vld1_dup_f16( -// CHECK: [[TMP2:%.*]] = load half, ptr %a, align 2 -// CHECK: [[TMP3:%.*]] = insertelement <4 x 
half> poison, half [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x half> [[LANE]] +// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x half> [[LANE]] +// float16x4_t test_vld1_dup_f16(float16_t const * a) { return vld1_dup_f16(a); } -// CHECK-LABEL: @test_vld1_dup_f32( -// CHECK: [[TMP2:%.*]] = load float, ptr %a, align 4 -// CHECK: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer -// CHECK: ret <2 x float> [[LANE]] +// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer +// CHECK-NEXT: ret <2 x float> [[LANE]] +// float32x2_t test_vld1_dup_f32(float32_t const * a) { return vld1_dup_f32(a); } -// CHECK-LABEL: @test_vld1_dup_p8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer -// CHECK: ret <8 x i8> [[LANE]] +// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> poison, i8 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// poly8x8_t test_vld1_dup_p8(poly8_t const * a) { return vld1_dup_p8(a); } -// CHECK-LABEL: @test_vld1_dup_p16( -// CHECK: [[TMP2:%.*]] = load i16, ptr %a, align 2 -// CHECK: [[TMP3:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0 -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer -// CHECK: ret <4 x i16> [[LANE]] +// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16( +// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <4 x i32> zeroinitializer +// CHECK-NEXT: ret <4 x i16> [[LANE]] +// poly16x4_t test_vld1_dup_p16(poly16_t const * a) { return vld1_dup_p16(a); } -// CHECK-LABEL: @test_vld1q_lane_u8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 -// CHECK: ret <16 x i8> [[VLD1_LANE]] +// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = 
load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i64 15 +// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]] +// uint8x16_t test_vld1q_lane_u8(uint8_t const * a, uint8x16_t b) { return vld1q_lane_u8(a, b, 15); } -// CHECK-LABEL: @test_vld1q_lane_u16( -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7 -// CHECK: ret <8 x i16> [[VLD1_LANE]] +// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[TMP0]], i64 7 +// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]] +// uint16x8_t test_vld1q_lane_u16(uint16_t const * a, uint16x8_t b) { return vld1q_lane_u16(a, b, 7); } -// CHECK-LABEL: @test_vld1q_lane_u32( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP4:%.*]] = load i32, ptr %a, align 4 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3 -// CHECK: ret <4 x i32> [[VLD1_LANE]] +// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[TMP0]], i64 3 +// CHECK-NEXT: ret <4 x i32> [[VLD1_LANE]] +// uint32x4_t test_vld1q_lane_u32(uint32_t const * a, uint32x4_t b) { return vld1q_lane_u32(a, b, 3); } -// CHECK-LABEL: @test_vld1q_lane_u64( -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer -// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %a, i32 4) -// CHECK: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> -// CHECK: ret <2 x i64> [[VLD1Q_LANE]] +// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x i64> [[B]], <2 x i64> poison, <1 x i32> zeroinitializer +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[A]], align 4 +// CHECK-NEXT: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <2 x i32> +// CHECK-NEXT: ret <2 x i64> [[VLD1Q_LANE]] +// uint64x2_t test_vld1q_lane_u64(uint64_t const * a, uint64x2_t b) { return vld1q_lane_u64(a, b, 1); } -// CHECK-LABEL: @test_vld1q_lane_s8( -// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1 -// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15 -// CHECK: ret <16 x i8> [[VLD1_LANE]] +// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i64 15 +// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]] +// int8x16_t test_vld1q_lane_s8(int8_t 
const * a, int8x16_t b) {
  return vld1q_lane_s8(a, b, 15);
}
-// CHECK-LABEL: @test_vld1q_lane_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK: ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
+//
int16x8_t test_vld1q_lane_s16(int16_t const * a, int16x8_t b) {
  return vld1q_lane_s16(a, b, 7);
}
-// CHECK-LABEL: @test_vld1q_lane_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
-// CHECK: ret <4 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x i32> [[VLD1_LANE]]
+//
int32x4_t test_vld1q_lane_s32(int32_t const * a, int32x4_t b) {
  return vld1q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vld1q_lane_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
-// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %a, i32 4)
-// CHECK: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32>
-// CHECK: ret <2 x i64> [[VLD1Q_LANE]]
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x i64> [[B]], <2 x i64> poison, <1 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <2 x i32>
+// CHECK-NEXT: ret <2 x i64> [[VLD1Q_LANE]]
+//
int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) {
  return vld1q_lane_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vld1q_lane_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: [[TMP4:%.*]] = load half, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
-// CHECK: ret <8 x half> [[VLD1_LANE]]
+// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x half> [[B]], half [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x half> [[VLD1_LANE]]
+//
float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) {
  return vld1q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: @test_vld1q_lane_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: [[TMP4:%.*]] = load float, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
-// CHECK: ret <4 x float> [[VLD1_LANE]]
+// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x float> [[B]], float [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x float> [[VLD1_LANE]]
+//
float32x4_t test_vld1q_lane_f32(float32_t const * a, float32x4_t b) {
  return vld1q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: @test_vld1q_lane_p8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK: ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i64 15
+// CHECK-NEXT: ret <16 x i8> [[VLD1_LANE]]
+//
poly8x16_t test_vld1q_lane_p8(poly8_t const * a, poly8x16_t b) {
  return vld1q_lane_p8(a, b, 15);
}
-// CHECK-LABEL: @test_vld1q_lane_p16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK: ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i16> [[VLD1_LANE]]
+//
poly16x8_t test_vld1q_lane_p16(poly16_t const * a, poly16x8_t b) {
  return vld1q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: @test_vld1_lane_u8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK: ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
+//
uint8x8_t test_vld1_lane_u8(uint8_t const * a, uint8x8_t b) {
  return vld1_lane_u8(a, b, 7);
}
-// CHECK-LABEL: @test_vld1_lane_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK: ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
+//
uint16x4_t test_vld1_lane_u16(uint16_t const * a, uint16x4_t b) {
  return vld1_lane_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vld1_lane_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
-// CHECK: ret <2 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x i32> [[VLD1_LANE]]
+//
uint32x2_t test_vld1_lane_u32(uint32_t const * a, uint32x2_t b) {
  return vld1_lane_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vld1_lane_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
-// CHECK: ret <1 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]]
+//
uint64x1_t test_vld1_lane_u64(uint64_t const * a, uint64x1_t b) {
  return vld1_lane_u64(a, b, 0);
}
-// CHECK-LABEL: @test_vld1_lane_s8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK: ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
+//
int8x8_t test_vld1_lane_s8(int8_t const * a, int8x8_t b) {
  return vld1_lane_s8(a, b, 7);
}
-// CHECK-LABEL: @test_vld1_lane_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK: ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
+//
int16x4_t test_vld1_lane_s16(int16_t const * a, int16x4_t b) {
  return vld1_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vld1_lane_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP4:%.*]] = load i32, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
-// CHECK: ret <2 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x i32> [[VLD1_LANE]]
+//
int32x2_t test_vld1_lane_s32(int32_t const * a, int32x2_t b) {
  return vld1_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vld1_lane_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP4:%.*]] = load i64, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
-// CHECK: ret <1 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[TMP0]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[VLD1_LANE]]
+//
int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) {
  return vld1_lane_s64(a, b, 0);
}
-// CHECK-LABEL: @test_vld1_lane_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: [[TMP4:%.*]] = load half, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3
-// CHECK: ret <4 x half> [[VLD1_LANE]]
+// CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x half> [[B]], half [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x half> [[VLD1_LANE]]
+//
float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) {
  return vld1_lane_f16(a, b, 3);
}
-// CHECK-LABEL: @test_vld1_lane_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: [[TMP4:%.*]] = load float, ptr %a, align 4
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
-// CHECK: ret <2 x float> [[VLD1_LANE]]
+// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <2 x float> [[B]], float [[TMP0]], i64 1
+// CHECK-NEXT: ret <2 x float> [[VLD1_LANE]]
+//
float32x2_t test_vld1_lane_f32(float32_t const * a, float32x2_t b) {
  return vld1_lane_f32(a, b, 1);
}
-// CHECK-LABEL: @test_vld1_lane_p8(
-// CHECK: [[TMP0:%.*]] = load i8, ptr %a, align 1
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK: ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i64 7
+// CHECK-NEXT: ret <8 x i8> [[VLD1_LANE]]
+//
poly8x8_t test_vld1_lane_p8(poly8_t const * a, poly8x8_t b) {
  return vld1_lane_p8(a, b, 7);
}
-// CHECK-LABEL: @test_vld1_lane_p16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = load i16, ptr %a, align 2
-// CHECK: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK: ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[TMP0]], i64 3
+// CHECK-NEXT: ret <4 x i16> [[VLD1_LANE]]
+//
poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {
  return vld1_lane_p16(a, b, 3);
}
-// CHECK-LABEL: @test_vld2q_u8(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
+// CHECK-LABEL: define void @test_vld2q_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
uint8x16x2_t test_vld2q_u8(uint8_t const * a) {
  return vld2q_u8(a);
}
-// CHECK-LABEL: @test_vld2q_u16(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld2q_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
uint16x8x2_t test_vld2q_u16(uint16_t const * a) {
  return vld2q_u16(a);
}
-// CHECK-LABEL: @test_vld2q_u32(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld2q_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
uint32x4x2_t test_vld2q_u32(uint32_t const * a) {
  return vld2q_u32(a);
}
-// CHECK-LABEL: @test_vld2q_s8(
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
+// CHECK-LABEL: define void @test_vld2q_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
int8x16x2_t test_vld2q_s8(int8_t const * a) {
  return vld2q_s8(a);
}
-// CHECK-LABEL: @test_vld2q_s16(
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld2q_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
int16x8x2_t test_vld2q_s16(int16_t const * a) {
  return vld2q_s16(a);
}
-// CHECK-LABEL: @test_vld2q_s32(
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld2q_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
int32x4x2_t test_vld2q_s32(int32_t const * a) {
  return vld2q_s32(a);
}
-// CHECK-LABEL: @test_vld2q_f16(
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <8 x half>, <8 x half>
+// CHECK-LABEL: define void @test_vld2q_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <8 x half>, <8 x half> } @llvm.arm.neon.vld2.v8f16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <8 x half> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x half> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
float16x8x2_t test_vld2q_f16(float16_t const * a) {
  return vld2q_f16(a);
}
-// CHECK-LABEL: @test_vld2q_f32(
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float>
+// CHECK-LABEL: define void @test_vld2q_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <4 x float> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x float> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
float32x4x2_t test_vld2q_f32(float32_t const * a) {
  return vld2q_f32(a);
}
-// CHECK-LABEL: @test_vld2q_p8(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
+// CHECK-LABEL: define void @test_vld2q_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <16 x i8> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
poly8x16x2_t test_vld2q_p8(poly8_t const * a) {
  return vld2q_p8(a);
}
-// CHECK-LABEL: @test_vld2q_p16(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld2q_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 0
+// CHECK-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_V]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
poly16x8x2_t test_vld2q_p16(poly16_t const * a) {
  return vld2q_p16(a);
}
-// CHECK-LABEL: @test_vld2_u8(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld2_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
uint8x8x2_t test_vld2_u8(uint8_t const * a) {
  return vld2_u8(a);
}
-// CHECK-LABEL: @test_vld2_u16(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld2_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
uint16x4x2_t test_vld2_u16(uint16_t const * a) {
  return vld2_u16(a);
}
-// CHECK-LABEL: @test_vld2_u32(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld2_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_V]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
uint32x2x2_t test_vld2_u32(uint32_t const * a) {
  return vld2_u32(a);
}
-// CHECK-LABEL: @test_vld2_u64(
-// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64>
+// CHECK-LABEL: define void @test_vld2_u64(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT64X1X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_V]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <1 x i64> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
uint64x1x2_t test_vld2_u64(uint64_t const * a) {
  return vld2_u64(a);
}
-// CHECK-LABEL: @test_vld2_s8(
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld2_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
int8x8x2_t test_vld2_s8(int8_t const * a) {
  return vld2_s8(a);
}
-// CHECK-LABEL: @test_vld2_s16(
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld2_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
int16x4x2_t test_vld2_s16(int16_t const * a) {
  return vld2_s16(a);
}
-// CHECK-LABEL: @test_vld2_s32(
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld2_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_V]], 1
+// CHECK-NEXT: store <2 x i32> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
int32x2x2_t test_vld2_s32(int32_t const * a) {
  return vld2_s32(a);
}
-// CHECK-LABEL: @test_vld2_s64(
-// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64>
+// CHECK-LABEL: define void @test_vld2_s64(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT64X1X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64> } [[VLD2_V]], 1
+// CHECK-NEXT: store <1 x i64> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <1 x i64> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
int64x1x2_t test_vld2_s64(int64_t const * a) {
  return vld2_s64(a);
}
-// CHECK-LABEL: @test_vld2_f16(
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <4 x half>, <4 x half>
+// CHECK-LABEL: define void @test_vld2_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <4 x half>, <4 x half> } @llvm.arm.neon.vld2.v4f16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_V]], 1
+// CHECK-NEXT: store <4 x half> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x half> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
float16x4x2_t test_vld2_f16(float16_t const * a) {
  return vld2_f16(a);
}
-// CHECK-LABEL: @test_vld2_f32(
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <2 x float>, <2 x float>
+// CHECK-LABEL: define void @test_vld2_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_V]], 1
+// CHECK-NEXT: store <2 x float> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x float> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
float32x2x2_t test_vld2_f32(float32_t const * a) {
  return vld2_f32(a);
}
-// CHECK-LABEL: @test_vld2_p8(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld2_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_V]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
poly8x8x2_t test_vld2_p8(poly8_t const * a) {
  return vld2_p8(a);
}
-// CHECK-LABEL: @test_vld2_p16(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld2_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 0
+// CHECK-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_V]], 1
+// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD2_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
poly16x4x2_t test_vld2_p16(poly16_t const * a) {
  return vld2_p16(a);
}
-// CHECK-LABEL: @test_vld2q_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld2q_lane_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
  return vld2q_lane_u16(a, b, 7);
}
-// CHECK-LABEL: @test_vld2q_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld2q_lane_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
  return vld2q_lane_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vld2q_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld2q_lane_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
  return vld2q_lane_s16(a, b, 7);
}
-// CHECK-LABEL: @test_vld2q_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld2q_lane_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], 1
+// CHECK-NEXT: store <4 x i32> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
  return vld2q_lane_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vld2q_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>
+// CHECK-LABEL: define void @test_vld2q_lane_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x half>, <8 x half> } @llvm.arm.neon.vld2lane.v8f16.p0(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half> } [[VLD2Q_LANE_V]], 1
+// CHECK-NEXT: store <8 x half> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x half> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
  return vld2q_lane_f16(a, b, 7);
}
-// CHECK-LABEL: @test_vld2q_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>
+// CHECK-LABEL: define void @test_vld2q_lane_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32.p0(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float> } [[VLD2Q_LANE_V]], 1
+// CHECK-NEXT: store <4 x float> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x float> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
  return vld2q_lane_f32(a, b, 3);
}
-// CHECK-LABEL: @test_vld2q_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld2q_lane_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], 1
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD2Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) {
  return vld2q_lane_p16(a, b, 7);
}
-// CHECK-LABEL: @test_vld2_lane_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld2_lane_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 0
+// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 1
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
  return vld2_lane_u8(a, b, 7);
}
-// CHECK-LABEL: @test_vld2_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]]
to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld2_lane_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 3, i32 2) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) { return vld2_lane_u16(a, b, 3); } -// CHECK-LABEL: @test_vld2_lane_u32( -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> -// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> +// CHECK-LABEL: define void @test_vld2_lane_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i32 1, i32 4) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) { return vld2_lane_u32(a, b, 1); } -// CHECK-LABEL: @test_vld2_lane_s8( -// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld2_lane_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr 
[[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) { return vld2_lane_s8(a, b, 7); } -// CHECK-LABEL: @test_vld2_lane_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld2_lane_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 3, i32 2) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) { return vld2_lane_s16(a, b, 3); } -// CHECK-LABEL: @test_vld2_lane_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 
0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32> -// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> +// CHECK-LABEL: define void @test_vld2_lane_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i32 1, i32 4) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x i32> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) { return vld2_lane_s32(a, b, 1); } -// CHECK-LABEL: @test_vld2_lane_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half> -// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 
x half>, <4 x half> +// CHECK-LABEL: define void @test_vld2_lane_f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x half>, <4 x half> } @llvm.arm.neon.vld2lane.v4f16.p0(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], i32 3, i32 2) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <4 x half> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x half> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) { return vld2_lane_f16(a, b, 3); } -// CHECK-LABEL: @test_vld2_lane_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float> -// CHECK: [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float> +// CHECK-LABEL: define void @test_vld2_lane_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float> } 
@llvm.arm.neon.vld2lane.v2f32.p0(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], i32 1, i32 4) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <2 x float> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x float> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) { return vld2_lane_f32(a, b, 1); } -// CHECK-LABEL: @test_vld2_lane_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld2_lane_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) { return vld2_lane_p8(a, b, 7); } -// CHECK-LABEL: @test_vld2_lane_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = 
getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld2_lane_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 3, i32 2) +// CHECK-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 0 +// CHECK-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], 1 +// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD2_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) { return vld2_lane_p16(a, b, 3); } -// CHECK-LABEL: @test_vld3q_u8( -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> +// CHECK-LABEL: define void @test_vld3q_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr 
inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint8x16x3_t test_vld3q_u8(uint8_t const * a) { return vld3q_u8(a); } -// CHECK-LABEL: @test_vld3q_u16( -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld3q_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint16x8x3_t test_vld3q_u16(uint16_t const * a) { return vld3q_u16(a); } -// CHECK-LABEL: @test_vld3q_u32( -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> +// CHECK-LABEL: define void @test_vld3q_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint32x4x3_t test_vld3q_u32(uint32_t const * a) { return vld3q_u32(a); } -// CHECK-LABEL: @test_vld3q_s8( -// CHECK: [[__RET:%.*]] = 
alloca %struct.int8x16x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> +// CHECK-LABEL: define void @test_vld3q_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int8x16x3_t test_vld3q_s8(int8_t const * a) { return vld3q_s8(a); } -// CHECK-LABEL: @test_vld3q_s16( -// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld3q_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int16x8x3_t test_vld3q_s16(int16_t const * a) { return vld3q_s16(a); } -// CHECK-LABEL: @test_vld3q_s32( -// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> +// CHECK-LABEL: define void @test_vld3q_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } 
[[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <4 x i32> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int32x4x3_t test_vld3q_s32(int32_t const * a) { return vld3q_s32(a); } -// CHECK-LABEL: @test_vld3q_f16( -// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half> +// CHECK-LABEL: define void @test_vld3q_f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld3.v8f16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <8 x half> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x half> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x half> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// float16x8x3_t test_vld3q_f16(float16_t const * a) { return vld3q_f16(a); } -// CHECK-LABEL: @test_vld3q_f32( -// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> +// CHECK-LABEL: define void @test_vld3q_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <4 x float> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x float> 
[[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <4 x float> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// float32x4x3_t test_vld3q_f32(float32_t const * a) { return vld3q_f32(a); } -// CHECK-LABEL: @test_vld3q_p8( -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> +// CHECK-LABEL: define void @test_vld3q_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <16 x i8> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// poly8x16x3_t test_vld3q_p8(poly8_t const * a) { return vld3q_p8(a); } -// CHECK-LABEL: @test_vld3q_p16( -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16 -// CHECK: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld3q_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 0 +// CHECK-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 1 +// CHECK-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], 2 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x i16> [[VLD3Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// poly16x8x3_t test_vld3q_p16(poly16_t const * a) { return vld3q_p16(a); } -// CHECK-LABEL: @test_vld3_u8( -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <8 x 
i8>, <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld3_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 2 +// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint8x8x3_t test_vld3_u8(uint8_t const * a) { return vld3_u8(a); } -// CHECK-LABEL: @test_vld3_u16( -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld3_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 2 +// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint16x4x3_t test_vld3_u16(uint16_t const * a) { return vld3_u16(a); } -// CHECK-LABEL: @test_vld3_u32( -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> +// CHECK-LABEL: define void @test_vld3_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 1 +// CHECK-NEXT: 
[[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 2 +// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint32x2x3_t test_vld3_u32(uint32_t const * a) { return vld3_u32(a); } -// CHECK-LABEL: @test_vld3_u64( -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> +// CHECK-LABEL: define void @test_vld3_u64( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT64X1X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 2 +// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint64x1x3_t test_vld3_u64(uint64_t const * a) { return vld3_u64(a); } -// CHECK-LABEL: @test_vld3_s8( -// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld3_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 2 +// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_2_EXTRACT]], ptr 
[[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int8x8x3_t test_vld3_s8(int8_t const * a) { return vld3_s8(a); } -// CHECK-LABEL: @test_vld3_s16( -// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld3_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 2 +// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int16x4x3_t test_vld3_s16(int16_t const * a) { return vld3_s16(a); } -// CHECK-LABEL: @test_vld3_s32( -// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> +// CHECK-LABEL: define void @test_vld3_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 0 +// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 1 +// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], 2 +// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <2 x i32> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int32x2x3_t test_vld3_s32(int32_t const * a) { return vld3_s32(a); } -// CHECK-LABEL: @test_vld3_s64( -// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> +// CHECK-LABEL: define void @test_vld3_s64( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT64X1X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x 
i64> } @llvm.arm.neon.vld3.v1i64.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], 2
+// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <1 x i64> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 int64x1x3_t test_vld3_s64(int64_t const * a) { return vld3_s64(a); }
-// CHECK-LABEL: @test_vld3_f16(
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[VLD3_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>
+// CHECK-LABEL: define void @test_vld3_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD3_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld3.v4f16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_V]], 2
+// CHECK-NEXT: store <4 x half> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x half> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x half> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 float16x4x3_t test_vld3_f16(float16_t const * a) { return vld3_f16(a); }
-// CHECK-LABEL: @test_vld3_f32(
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[VLD3_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>
+// CHECK-LABEL: define void @test_vld3_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD3_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32.p0(ptr [[A]], i32 4)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_V]], 2
+// CHECK-NEXT: store <2 x float> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x float> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x float> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 float32x2x3_t test_vld3_f32(float32_t const * a) { return vld3_f32(a); }
-// CHECK-LABEL: @test_vld3_p8(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld3_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0(ptr [[A]], i32 1)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 poly8x8x3_t test_vld3_p8(poly8_t const * a) { return vld3_p8(a); }
-// CHECK-LABEL: @test_vld3_p16(
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld3_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0(ptr [[A]], i32 2)
+// CHECK-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 0
+// CHECK-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 1
+// CHECK-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD3_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 poly16x4x3_t test_vld3_p16(poly16_t const * a) { return vld3_p16(a); }
-// CHECK-LABEL: @test_vld3q_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld3q_lane_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
 uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) { return vld3q_lane_u16(a, b, 7); }
-// CHECK-LABEL: @test_vld3q_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld3q_lane_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 2
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
 uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) { return vld3q_lane_u32(a, b, 3); }
-// CHECK-LABEL: @test_vld3q_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld3q_lane_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
 int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) { return vld3q_lane_s16(a, b, 7); }
-// CHECK-LABEL: @test_vld3q_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld3q_lane_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], 2
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x i32> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
 int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) { return vld3q_lane_s32(a, b, 3); }
-// CHECK-LABEL: @test_vld3q_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
-// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>
+// CHECK-LABEL: define void @test_vld3q_lane_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld3lane.v8f16.p0(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half> } [[VLD3Q_LANE_V]], 2
+// CHECK-NEXT: store <8 x half> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x half> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x half> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
 float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) { return vld3q_lane_f16(a, b, 7); }
-// CHECK-LABEL: @test_vld3q_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
-// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>
+// CHECK-LABEL: define void @test_vld3q_lane_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32.p0(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_LANE_V]], 2
+// CHECK-NEXT: store <4 x float> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x float> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x float> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
 float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) { return vld3q_lane_f32(a, b, 3); }
-// CHECK-LABEL: @test_vld3q_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld3q_lane_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X3_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], 2
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD3Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
 poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) { return vld3q_lane_p16(a, b, 7); }
-// CHECK-LABEL: @test_vld3_lane_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld3_lane_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) { return vld3_lane_u8(a, b, 7); }
-// CHECK-LABEL: @test_vld3_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld3_lane_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) { return vld3_lane_u16(a, b, 3); }
-// CHECK-LABEL: @test_vld3_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld3_lane_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i32 1, i32 4)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) { return vld3_lane_u32(a, b, 1); }
-// CHECK-LABEL: @test_vld3_lane_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld3_lane_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
++//
 int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) { return vld3_lane_s8(a, b, 7); }
-// CHECK-LABEL: @test_vld3_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld3_lane_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) { return vld3_lane_s16(a, b, 3); }
-// CHECK-LABEL: @test_vld3_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld3_lane_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i32 1, i32 4)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x i32> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) { return vld3_lane_s32(a, b, 1); }
-// CHECK-LABEL: @test_vld3_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>
+// CHECK-LABEL: define void @test_vld3_lane_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld3lane.v4f16.p0(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <4 x half> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x half> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x half> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) { return vld3_lane_f16(a, b, 3); }
-// CHECK-LABEL: @test_vld3_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
-// CHECK: [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>
+// CHECK-LABEL: define void @test_vld3_lane_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32.p0(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], i32 1, i32 4)
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE_V]], 0
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE_V]], 1
+// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE_V]], 2
+// CHECK-NEXT: store <2 x float> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x float> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x float> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
 float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) { return vld3_lane_f32(a, b, 1); }
-// CHECK-LABEL: @test_vld3_lane_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK:
[[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld3_lane_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1) +// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 0 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 1 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], 2 +// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) { return vld3_lane_p8(a, b, 7); } -// CHECK-LABEL: @test_vld3_lane_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, 
ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> -// CHECK: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld3_lane_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X3_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 3, i32 2) +// CHECK-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 0 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 1 +// CHECK-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], 2 +// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD3_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) { return vld3_lane_p16(a, b, 3); } -// CHECK-LABEL: @test_vld4q_u8( -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { 
<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> +// CHECK-LABEL: define void @test_vld4q_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint8x16x4_t test_vld4q_u8(uint8_t const * a) { return vld4q_u8(a); } -// CHECK-LABEL: @test_vld4q_u16( -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld4q_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_3_EXTRACT]], ptr 
[[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint16x8x4_t test_vld4q_u16(uint16_t const * a) { return vld4q_u16(a); } -// CHECK-LABEL: @test_vld4q_u32( -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> +// CHECK-LABEL: define void @test_vld4q_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// uint32x4x4_t test_vld4q_u32(uint32_t const * a) { return vld4q_u32(a); } -// CHECK-LABEL: @test_vld4q_s8( -// CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> +// CHECK-LABEL: define void @test_vld4q_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <16 
x i8> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int8x16x4_t test_vld4q_s8(int8_t const * a) { return vld4q_s8(a); } -// CHECK-LABEL: @test_vld4q_s16( -// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld4q_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int16x8x4_t test_vld4q_s16(int16_t const * a) { return vld4q_s16(a); } -// CHECK-LABEL: @test_vld4q_s32( -// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> +// CHECK-LABEL: define void @test_vld4q_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 
16 +// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <4 x i32> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// int32x4x4_t test_vld4q_s32(int32_t const * a) { return vld4q_s32(a); } -// CHECK-LABEL: @test_vld4q_f16( -// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> +// CHECK-LABEL: define void @test_vld4q_f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld4.v8f16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <8 x half> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x half> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x half> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <8 x half> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// float16x8x4_t test_vld4q_f16(float16_t const * a) { return vld4q_f16(a); } -// CHECK-LABEL: @test_vld4q_f32( -// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> +// CHECK-LABEL: define void @test_vld4q_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> 
} [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <4 x float> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x float> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <4 x float> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <4 x float> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// float32x4x4_t test_vld4q_f32(float32_t const * a) { return vld4q_f32(a); } -// CHECK-LABEL: @test_vld4q_p8( -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> +// CHECK-LABEL: define void @test_vld4q_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <16 x i8> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// poly8x16x4_t test_vld4q_p8(poly8_t const * a) { return vld4q_p8(a); } -// CHECK-LABEL: @test_vld4q_p16( -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> +// CHECK-LABEL: define void @test_vld4q_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x 
i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 0 +// CHECK-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 1 +// CHECK-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 2 +// CHECK-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], 3 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48 +// CHECK-NEXT: store <8 x i16> [[VLD4Q_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16 +// CHECK-NEXT: ret void +// poly16x8x4_t test_vld4q_p16(poly16_t const * a) { return vld4q_p16(a); } -// CHECK-LABEL: @test_vld4_u8( -// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld4_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 3 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint8x8x4_t test_vld4_u8(uint8_t const * a) { return vld4_u8(a); } -// CHECK-LABEL: @test_vld4_u16( -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld4_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 3 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint16x4x4_t test_vld4_u16(uint16_t const * a) { return vld4_u16(a); } -// CHECK-LABEL: @test_vld4_u32( -// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> +// CHECK-LABEL: define void @test_vld4_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 3 +// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint32x2x4_t test_vld4_u32(uint32_t const * a) { return vld4_u32(a); } -// CHECK-LABEL: @test_vld4_u64( -// CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, 
<1 x i64> +// CHECK-LABEL: define void @test_vld4_u64( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT64X1X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 3 +// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// uint64x1x4_t test_vld4_u64(uint64_t const * a) { return vld4_u64(a); } -// CHECK-LABEL: @test_vld4_s8( -// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld4_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 3 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int8x8x4_t test_vld4_s8(int8_t const * a) { 
return vld4_s8(a); } -// CHECK-LABEL: @test_vld4_s16( -// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld4_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 3 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int16x4x4_t test_vld4_s16(int16_t const * a) { return vld4_s16(a); } -// CHECK-LABEL: @test_vld4_s32( -// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> +// CHECK-LABEL: define void @test_vld4_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], 3 +// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, 
ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <2 x i32> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int32x2x4_t test_vld4_s32(int32_t const * a) { return vld4_s32(a); } -// CHECK-LABEL: @test_vld4_s64( -// CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> +// CHECK-LABEL: define void @test_vld4_s64( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT64X1X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], 3 +// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <1 x i64> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// int64x1x4_t test_vld4_s64(int64_t const * a) { return vld4_s64(a); } -// CHECK-LABEL: @test_vld4_f16( -// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> +// CHECK-LABEL: define void @test_vld4_f16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld4.v4f16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_V]], 3 +// CHECK-NEXT: store <4 x half> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x half> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr 
inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x half> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <4 x half> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float16x4x4_t test_vld4_f16(float16_t const * a) { return vld4_f16(a); } -// CHECK-LABEL: @test_vld4_f32( -// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> +// CHECK-LABEL: define void @test_vld4_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32.p0(ptr [[A]], i32 4) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], 3 +// CHECK-NEXT: store <2 x float> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x float> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <2 x float> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <2 x float> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float32x2x4_t test_vld4_f32(float32_t const * a) { return vld4_f32(a); } -// CHECK-LABEL: @test_vld4_p8( -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld4_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0(ptr [[A]], i32 1) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], 3 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: 
[[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <8 x i8> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly8x8x4_t test_vld4_p8(poly8_t const * a) { return vld4_p8(a); } -// CHECK-LABEL: @test_vld4_p16( -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld4_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0(ptr [[A]], i32 2) +// CHECK-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 0 +// CHECK-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 1 +// CHECK-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 2 +// CHECK-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], 3 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <4 x i16> [[VLD4_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly16x4x4_t test_vld4_p16(poly16_t const * a) { return vld4_p16(a); } -// CHECK-LABEL: @test_vld4q_lane_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw 
%struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld4q_lane_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
  return vld4q_lane_u16(a, b, 7);
}

-// CHECK-LABEL: @test_vld4q_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld4q_lane_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
  return vld4q_lane_u32(a, b, 3);
}

-// CHECK-LABEL: @test_vld4q_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld4q_lane_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
  return vld4q_lane_s16(a, b, 7);
}

-// CHECK-LABEL: @test_vld4q_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
+// CHECK-LABEL: define void @test_vld4q_lane_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <4 x i32> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
  return vld4q_lane_s32(a, b, 3);
}

-// CHECK-LABEL: @test_vld4q_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x half>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half>
+// CHECK-LABEL: define void @test_vld4q_lane_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.arm.neon.vld4lane.v8f16.p0(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <8 x half> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x half> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x half> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <8 x half> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
  return vld4q_lane_f16(a, b, 7);
}

-// CHECK-LABEL: @test_vld4q_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x float>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float>
+// CHECK-LABEL: define void @test_vld4q_lane_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32.p0(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]], i32 3, i32 4)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <4 x float> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x float> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <4 x float> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <4 x float> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
  return vld4q_lane_f32(a, b, 3);
}

-// CHECK-LABEL: @test_vld4q_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK: [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
-// CHECK: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK-LABEL: define void @test_vld4q_lane_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X4_T:%.*]]) align 16 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2)
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 0
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 1
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 2
+// CHECK-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], 3
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 16
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 32
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 48
+// CHECK-NEXT: store <8 x i16> [[VLD4Q_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 16
+// CHECK-NEXT: ret void
+//
poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) {
  return vld4q_lane_p16(a, b, 7);
}

-// CHECK-LABEL: @test_vld4_lane_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld4_lane_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) {
  return vld4_lane_u8(a, b, 7);
}

-// CHECK-LABEL: @test_vld4_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld4_lane_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) {
  return vld4_lane_u16(a, b, 3);
}

-// CHECK-LABEL: @test_vld4_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld4_lane_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 1, i32 4)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) {
  return vld4_lane_u32(a, b, 1);
}

-// CHECK-LABEL: @test_vld4_lane_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>
+// CHECK-LABEL: define void @test_vld4_lane_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) {
  return vld4_lane_s8(a, b, 7);
}

-// CHECK-LABEL: @test_vld4_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK-LABEL: define void @test_vld4_lane_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) {
  return vld4_lane_s16(a, b, 3);
}

-// CHECK-LABEL: @test_vld4_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>
+// CHECK-LABEL: define void @test_vld4_lane_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 1, i32 4)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 2
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], 3
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24
+// CHECK-NEXT: store <2 x i32> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8
+// CHECK-NEXT: ret void
+//
int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
  return vld4_lane_s32(a, b, 1);
}

-// CHECK-LABEL: @test_vld4_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP5:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP7:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP9:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP11:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP12:%.*]] = bitcast <4 x half> [[TMP11]] to <8 x i8>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
-// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x half>
-// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half>
+// CHECK-LABEL: define void @test_vld4_lane_f16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.arm.neon.vld4lane.v4f16.p0(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], <4 x half> [[TMP3]], i32 3, i32 2)
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE_V]], 0
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE_V]], 1
+// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE_V]], 2
+//
CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE_V]], 3 +// CHECK-NEXT: store <4 x half> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x half> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x half> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <4 x half> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) { return vld4_lane_f16(a, b, 3); } -// CHECK-LABEL: @test_vld4_lane_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP11:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP12:%.*]] = bitcast <2 x float> [[TMP11]] to <8 x i8> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float> -// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x float> -// CHECK: [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> +// CHECK-LABEL: define void @test_vld4_lane_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32.p0(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]], i32 1, i32 4) +// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], 0 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], 1 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], 2 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], 3 +// CHECK-NEXT: store <2 x float> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <2 x float> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <2 x float> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <2 x float> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) { return vld4_lane_f32(a, b, 1); } -// CHECK-LABEL: @test_vld4_lane_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr 
[[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> +// CHECK-LABEL: define void @test_vld4_lane_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1) +// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 0 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 1 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 2 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], 3 +// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 +// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <8 x i8> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) { return vld4_lane_p8(a, b, 7); } -// CHECK-LABEL: @test_vld4_lane_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void 
@llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP5:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP7:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP9:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP11:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16> -// CHECK: [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16> -// CHECK: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> +// CHECK-LABEL: define void @test_vld4_lane_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X4_T:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 3, i32 2) +// CHECK-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 0 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 1 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 2 +// CHECK-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], 3 +// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_0_EXTRACT]], ptr [[AGG_RESULT]], align 8 
+// CHECK-NEXT: [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_1_EXTRACT]], ptr [[__RET_SROA_2_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_2_EXTRACT]], ptr [[__RET_SROA_3_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 24 +// CHECK-NEXT: store <4 x i16> [[VLD4_LANE_V_FCA_3_EXTRACT]], ptr [[__RET_SROA_4_0_AGG_RESULT_SROA_IDX]], align 8 +// CHECK-NEXT: ret void +// poly16x4x4_t test_vld4_lane_p16(poly16_t const * a, poly16x4x4_t b) { return vld4_lane_p16(a, b, 3); } -// CHECK-LABEL: @test_vmax_s8( -// CHECK: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VMAX_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vmax_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VMAX_V_I]] +// int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) { return vmax_s8(a, b); } -// CHECK-LABEL: @test_vmax_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VMAX_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vmax_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VMAX_V2_I]] +// int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) { return vmax_s16(a, b); } -// CHECK-LABEL: @test_vmax_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VMAX_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vmax_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VMAX_V2_I]] +// int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) { return vmax_s32(a, b); } -// CHECK-LABEL: @test_vmax_u8( -// CHECK: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VMAX_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vmax_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VMAX_V_I]] +// uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) { return vmax_u8(a, b); } -// CHECK-LABEL: @test_vmax_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x 
i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VMAX_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vmax_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VMAX_V2_I]] +// uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) { return vmax_u16(a, b); } -// CHECK-LABEL: @test_vmax_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VMAX_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vmax_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VMAX_V2_I]] +// uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) { return vmax_u32(a, b); } -// CHECK-LABEL: @test_vmax_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: [[VMAX_V3_I:%.*]] = bitcast <2 x float> [[VMAX_V2_I]] to <8 x i8> -// CHECK: ret <2 x float> [[VMAX_V2_I]] +// CHECK-LABEL: define <2 x float> @test_vmax_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VMAX_V2_I]] +// float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) { return vmax_f32(a, b); } -// CHECK-LABEL: @test_vmaxq_s8( -// CHECK: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VMAXQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vmaxq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VMAXQ_V_I]] +// int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) { return vmaxq_s8(a, b); } -// CHECK-LABEL: @test_vmaxq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VMAXQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vmaxq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VMAXQ_V2_I]] 
+// int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) { return vmaxq_s16(a, b); } -// CHECK-LABEL: @test_vmaxq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VMAXQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vmaxq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMAXQ_V2_I]] +// int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) { return vmaxq_s32(a, b); } -// CHECK-LABEL: @test_vmaxq_u8( -// CHECK: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VMAXQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vmaxq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VMAXQ_V_I]] +// uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) { return vmaxq_u8(a, b); } -// CHECK-LABEL: @test_vmaxq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VMAXQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vmaxq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VMAXQ_V2_I]] +// uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) { return vmaxq_u16(a, b); } -// CHECK-LABEL: @test_vmaxq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VMAXQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vmaxq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMAXQ_V2_I]] +// uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) { return vmaxq_u32(a, b); } -// CHECK-LABEL: @test_vmaxq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VMAXQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: [[VMAXQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VMAXQ_V2_I]] +// CHECK-LABEL: define <4 x float> @test_vmaxq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMAXQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> [[A]], <4 x float> [[B]]) +// CHECK-NEXT: ret <4 x float> [[VMAXQ_V2_I]] +// float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) { return vmaxq_f32(a, b); } -// CHECK-LABEL: @test_vmin_s8( -// CHECK: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VMIN_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vmin_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VMIN_V_I]] +// int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) { return vmin_s8(a, b); } -// CHECK-LABEL: @test_vmin_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VMIN_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vmin_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VMIN_V2_I]] +// int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) { return vmin_s16(a, b); } -// CHECK-LABEL: @test_vmin_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VMIN_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vmin_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VMIN_V2_I]] +// int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) { return vmin_s32(a, b); } -// CHECK-LABEL: @test_vmin_u8( -// CHECK: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VMIN_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vmin_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VMIN_V_I]] +// uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) { return vmin_u8(a, b); } -// CHECK-LABEL: @test_vmin_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VMIN_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vmin_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> 
[[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VMIN_V2_I]] +// uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) { return vmin_u16(a, b); } -// CHECK-LABEL: @test_vmin_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VMIN_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vmin_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VMIN_V2_I]] +// uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) { return vmin_u32(a, b); } -// CHECK-LABEL: @test_vmin_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: [[VMIN_V3_I:%.*]] = bitcast <2 x float> [[VMIN_V2_I]] to <8 x i8> -// CHECK: ret <2 x float> [[VMIN_V2_I]] +// CHECK-LABEL: define <2 x float> @test_vmin_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VMIN_V2_I]] +// float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) { return vmin_f32(a, b); } -// CHECK-LABEL: @test_vminq_s8( -// CHECK: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VMINQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vminq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VMINQ_V_I]] +// int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) { return vminq_s8(a, b); } -// CHECK-LABEL: @test_vminq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VMINQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vminq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VMINQ_V2_I]] +// int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) { return vminq_s16(a, b); } -// CHECK-LABEL: @test_vminq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VMINQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vminq_s32( +// CHECK-SAME: <4 x i32> noundef 
[[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMINQ_V2_I]] +// int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) { return vminq_s32(a, b); } -// CHECK-LABEL: @test_vminq_u8( -// CHECK: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VMINQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vminq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VMINQ_V_I]] +// uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) { return vminq_u8(a, b); } -// CHECK-LABEL: @test_vminq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VMINQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vminq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VMINQ_V2_I]] +// uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) { return vminq_u16(a, b); } -// CHECK-LABEL: @test_vminq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VMINQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vminq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VMINQ_V2_I]] +// uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) { return vminq_u32(a, b); } -// CHECK-LABEL: @test_vminq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VMINQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: [[VMINQ_V3_I:%.*]] = bitcast <4 x float> [[VMINQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VMINQ_V2_I]] +// CHECK-LABEL: define <4 x float> @test_vminq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMINQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> [[A]], <4 x float> [[B]]) +// CHECK-NEXT: ret <4 x float> [[VMINQ_V2_I]] +// float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) { return vminq_f32(a, b); } -// CHECK-LABEL: @test_vmla_s8( -// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c -// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]] -// CHECK: ret <8 x i8> [[ADD_I]] +// CHECK-LABEL: define <8 x i8> @test_vmla_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 
x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i8> [[ADD_I]] +// int8x8_t test_vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vmla_s8(a, b, c); } -// CHECK-LABEL: @test_vmla_s16( -// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c -// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] -// CHECK: ret <4 x i16> [[ADD_I]] +// CHECK-LABEL: define <4 x i16> @test_vmla_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i16> [[ADD_I]] +// int16x4_t test_vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmla_s16(a, b, c); } -// CHECK-LABEL: @test_vmla_s32( -// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c -// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] -// CHECK: ret <2 x i32> [[ADD_I]] +// CHECK-LABEL: define <2 x i32> @test_vmla_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <2 x i32> [[ADD_I]] +// int32x2_t test_vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmla_s32(a, b, c); } -// CHECK-LABEL: @test_vmla_f32( -// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, %c -// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]] -// CHECK: ret <2 x float> [[ADD_I]] +// CHECK-LABEL: define <2 x float> @test_vmla_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <2 x float> [[ADD_I]] +// float32x2_t test_vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmla_f32(a, b, c); } -// CHECK-LABEL: @test_vmla_u8( -// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c -// CHECK: [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]] -// CHECK: ret <8 x i8> [[ADD_I]] +// CHECK-LABEL: define <8 x i8> @test_vmla_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i8> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i8> [[ADD_I]] +// uint8x8_t test_vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vmla_u8(a, b, c); } -// CHECK-LABEL: @test_vmla_u16( -// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c -// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]] -// CHECK: ret <4 x i16> [[ADD_I]] +// CHECK-LABEL: define <4 x i16> @test_vmla_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i16> [[ADD_I]] +// uint16x4_t test_vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmla_u16(a, b, c); } -// CHECK-LABEL: @test_vmla_u32( -// CHECK: 
[[MUL_I:%.*]] = mul <2 x i32> %b, %c -// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]] -// CHECK: ret <2 x i32> [[ADD_I]] +// CHECK-LABEL: define <2 x i32> @test_vmla_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <2 x i32> [[ADD_I]] +// uint32x2_t test_vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmla_u32(a, b, c); } -// CHECK-LABEL: @test_vmlaq_s8( -// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c -// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]] -// CHECK: ret <16 x i8> [[ADD_I]] +// CHECK-LABEL: define <16 x i8> @test_vmlaq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <16 x i8> [[ADD_I]] +// int8x16_t test_vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) { return vmlaq_s8(a, b, c); } -// CHECK-LABEL: @test_vmlaq_s16( -// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vmlaq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// int16x8_t test_vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { return vmlaq_s16(a, b, c); } -// CHECK-LABEL: @test_vmlaq_s32( -// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlaq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { return vmlaq_s32(a, b, c); } -// CHECK-LABEL: @test_vmlaq_f32( -// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, %c -// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]] -// CHECK: ret <4 x float> [[ADD_I]] +// CHECK-LABEL: define <4 x float> @test_vmlaq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x float> [[ADD_I]] +// float32x4_t test_vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) { return vmlaq_f32(a, b, c); } -// CHECK-LABEL: @test_vmlaq_u8( -// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c -// CHECK: [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]] -// CHECK: ret <16 x i8> [[ADD_I]] +// CHECK-LABEL: define <16 x i8> @test_vmlaq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[B]], 
[[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <16 x i8> [[ADD_I]] +// uint8x16_t test_vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) { return vmlaq_u8(a, b, c); } -// CHECK-LABEL: @test_vmlaq_u16( -// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vmlaq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// uint16x8_t test_vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return vmlaq_u16(a, b, c); } -// CHECK-LABEL: @test_vmlaq_u32( -// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlaq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[C]] +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[MUL_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// uint32x4_t test_vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return vmlaq_u32(a, b, c); } -// CHECK-LABEL: @test_vmlal_s8( -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vmlal_s8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) { return vmlal_s8(a, b, c); } -// CHECK-LABEL: @test_vmlal_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlal_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlal_s16(a, b, c); } -// CHECK-LABEL: @test_vmlal_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define <2 x i64> @test_vmlal_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x 
i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlal_s32(a, b, c); } -// CHECK-LABEL: @test_vmlal_u8( -// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) -// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]] -// CHECK: ret <8 x i16> [[ADD_I]] +// CHECK-LABEL: define <8 x i16> @test_vmlal_u8( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[VMULL_I]] +// CHECK-NEXT: ret <8 x i16> [[ADD_I]] +// uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) { return vmlal_u8(a, b, c); } -// CHECK-LABEL: @test_vmlal_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]] -// CHECK: ret <4 x i32> [[ADD_I]] +// CHECK-LABEL: define <4 x i32> @test_vmlal_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD_I]] +// uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlal_u16(a, b, c); } -// CHECK-LABEL: @test_vmlal_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8> -// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]] -// CHECK: ret <2 x i64> [[ADD_I]] +// CHECK-LABEL: define <2 x i64> @test_vmlal_u32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[C]]) +// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD_I]] +// uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlal_u32(a, b, c); } -// CHECK-LABEL: @test_vmlal_lane_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] -// CHECK: ret <4 x i32> [[ADD]] +// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 
x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD]] +// int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlal_lane_s16(a, b, c, 3); } -// CHECK-LABEL: @test_vmlal_lane_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] -// CHECK: ret <2 x i64> [[ADD]] +// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) +// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <2 x i64> [[ADD]] +// int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlal_lane_s32(a, b, c, 1); } -// CHECK-LABEL: @test_vmlal_lane_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) -// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] -// CHECK: ret <4 x i32> [[ADD]] +// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) +// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A]], [[VMULL2_I]] +// CHECK-NEXT: ret <4 x i32> [[ADD]] +// uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlal_lane_u16(a, b, c, 3); } -// CHECK-LABEL: @test_vmlal_lane_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) -// CHECK: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] -// CHECK: ret <2 x i64> [[ADD]] 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD]]
+//
uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlal_lane_u32(a, b, c, 1);
}

-// CHECK-LABEL: @test_vmlal_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vmlal_n_s16(a, b, c);
}

-// CHECK-LABEL: @test_vmlal_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vmlal_n_s32(a, b, c);
}

-// CHECK-LABEL: @test_vmlal_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
  return vmlal_n_u16(a, b, c);
}

-// CHECK-LABEL: @test_vmlal_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[ADD_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
+//
uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
  return vmlal_n_u32(a, b, c);
}

-// CHECK-LABEL: @test_vmla_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x i16> [[ADD]]
+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x i16> [[ADD]]
+//
int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vmla_lane_s16(a, b, c, 3);
}

-// CHECK-LABEL: @test_vmla_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
-// CHECK: ret <2 x i32> [[ADD]]
+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A]], [[MUL]]
+// CHECK-NEXT: ret <2 x i32> [[ADD]]
+//
int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vmla_lane_s32(a, b, c, 1);
}

-// CHECK-LABEL: @test_vmla_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x i16> [[ADD]]
+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x i16> [[ADD]]
+//
uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmla_lane_u16(a, b, c, 3);
}

-// CHECK-LABEL: @test_vmla_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]]
-// CHECK: ret <2 x i32> [[ADD]]
+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A]], [[MUL]]
+// CHECK-NEXT: ret <2 x i32> [[ADD]]
+//
uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmla_lane_u32(a, b, c, 1);
}

-// CHECK-LABEL: @test_vmla_lane_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]]
-// CHECK: ret <2 x float> [[ADD]]
+// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]]
+// CHECK-NEXT: ret <2 x float> [[ADD]]
+//
float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vmla_lane_f32(a, b, c, 1);
}

-// CHECK-LABEL: @test_vmlaq_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
-// CHECK: ret <8 x i16> [[ADD]]
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A]], [[MUL]]
+// CHECK-NEXT: ret <8 x i16> [[ADD]]
+//
int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
  return vmlaq_lane_s16(a, b, c, 3);
}

-// CHECK-LABEL: @test_vmlaq_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x i32> [[ADD]]
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x i32> [[ADD]]
+//
int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
  return vmlaq_lane_s32(a, b, c, 1);
}

-// CHECK-LABEL: @test_vmlaq_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]]
-// CHECK: ret <8 x i16> [[ADD]]
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A]], [[MUL]]
+// CHECK-NEXT: ret <8 x i16> [[ADD]]
+//
uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
  return vmlaq_lane_u16(a, b, c, 3);
}

-// CHECK-LABEL: @test_vmlaq_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x i32> [[ADD]]
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x i32> [[ADD]]
+//
uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
  return vmlaq_lane_u32(a, b, c, 1);
}

-// CHECK-LABEL: @test_vmlaq_lane_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
-// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x float> [[ADD]]
+// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
+// CHECK-NEXT: [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x float> [[ADD]]
+//
float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
  return vmlaq_lane_f32(a, b, c, 1);
}

-// CHECK-LABEL: @test_vmla_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
  return vmla_n_s16(a, b, c);
}

-// CHECK-LABEL: @test_vmla_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[VECINIT1_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
  return vmla_n_s32(a, b, c);
}

-// CHECK-LABEL: @test_vmla_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
-// CHECK: [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
-// CHECK: ret <4 x i16> [[ADD_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[ADD_I]]
+//
uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
  return vmla_n_u16(a, b, c);
}

-// CHECK-LABEL: @test_vmla_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
-// CHECK: [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
-// CHECK: ret <2 x i32> [[ADD_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[VECINIT1_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[ADD_I]]
+//
uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
  return vmla_n_u32(a, b, c);
}

-// CHECK-LABEL: @test_vmla_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
-// CHECK: [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
-// CHECK: ret <2 x float> [[ADD_I]]
+// CHECK-LABEL: define <2 x float> @test_vmla_n_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[VECINIT1_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <2 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x float> [[ADD_I]]
+//
float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
  return vmla_n_f32(a, b, c);
}

-// CHECK-LABEL: @test_vmlaq_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[VECINIT7_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
  return vmlaq_n_s16(a, b, c);
}

-// CHECK-LABEL: @test_vmlaq_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
  return vmlaq_n_s32(a, b, c);
}

-// CHECK-LABEL: @test_vmlaq_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
-// CHECK: [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
-// CHECK: ret <8 x i16> [[ADD_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[VECINIT7_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <8 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[ADD_I]]
+//
uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
  return vmlaq_n_u16(a, b, c);
}

-// CHECK-LABEL: @test_vmlaq_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
-// CHECK: [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
-// CHECK: ret <4 x i32> [[ADD_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
+//
uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
  return vmlaq_n_u32(a, b, c);
}

-// CHECK-LABEL: @test_vmlaq_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
-// CHECK: [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
-// CHECK: ret <4 x float> [[ADD_I]]
+// CHECK-LABEL: define <4 x float> @test_vmlaq_n_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[ADD_I:%.*]] = fadd <4 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x float> [[ADD_I]]
+//
float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
  return vmlaq_n_f32(a, b, c);
}

-// CHECK-LABEL: @test_vmls_s8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmls_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
int8x8_t test_vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
  return vmls_s8(a, b, c);
}

-// CHECK-LABEL: @test_vmls_s16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmls_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
int16x4_t test_vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vmls_s16(a, b, c);
}

-// CHECK-LABEL: @test_vmls_s32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmls_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
int32x2_t test_vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vmls_s32(a, b, c);
}

-// CHECK-LABEL: @test_vmls_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, %c
-// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
-// CHECK: ret <2 x float> [[SUB_I]]
+// CHECK-LABEL: define <2 x float> @test_vmls_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x float> [[SUB_I]]
+//
float32x2_t test_vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vmls_f32(a, b, c);
}

-// CHECK-LABEL: @test_vmls_u8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmls_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
uint8x8_t test_vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmls_u8(a, b, c);
}

-// CHECK-LABEL: @test_vmls_u16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmls_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
uint16x4_t test_vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmls_u16(a, b, c);
}

-// CHECK-LABEL: @test_vmls_u32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmls_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
uint32x2_t test_vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmls_u32(a, b, c);
}

-// CHECK-LABEL: @test_vmlsq_s8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmlsq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
int8x16_t test_vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
  return vmlsq_s8(a, b, c);
}

-// CHECK-LABEL: @test_vmlsq_s16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
  return vmlsq_s16(a, b, c);
}

-// CHECK-LABEL: @test_vmlsq_s32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
  return vmlsq_s32(a, b, c);
}

-// CHECK-LABEL: @test_vmlsq_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, %c
-// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
-// CHECK: ret <4 x float> [[SUB_I]]
+// CHECK-LABEL: define <4 x float> @test_vmlsq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x float> [[SUB_I]]
+//
float32x4_t test_vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vmlsq_f32(a, b, c);
}

-// CHECK-LABEL: @test_vmlsq_u8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmlsq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
uint8x16_t test_vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
  return vmlsq_u8(a, b, c);
}

-// CHECK-LABEL: @test_vmlsq_u16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <8 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
  return vmlsq_u16(a, b, c);
}

-// CHECK-LABEL: @test_vmlsq_u32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, %c
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <4 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[C]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
  return vmlsq_u32(a, b, c);
}

-// CHECK-LABEL: @test_vmlsl_s8(
-// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlsl_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
  return vmlsl_s8(a, b, c);
}

-// CHECK-LABEL: @test_vmlsl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlsl_s16(a, b, c);
}

-// CHECK-LABEL: @test_vmlsl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlsl_s32(a, b, c);
}

-// CHECK-LABEL: @test_vmlsl_u8(
-// CHECK: [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c)
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlsl_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMULL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
  return vmlsl_u8(a, b, c);
}

-// CHECK-LABEL: @test_vmlsl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlsl_u16(a, b, c);
}

-// CHECK-LABEL: @test_vmlsl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlsl_u32(a, b, c);
}

-// CHECK-LABEL: @test_vmlsl_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
-// CHECK: ret <4 x i32> [[SUB]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB]]
+//
int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vmlsl_lane_s16(a, b, c, 3);
}

-// CHECK-LABEL: @test_vmlsl_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
-// CHECK: ret <2 x i64> [[SUB]]
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB]]
+//
int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vmlsl_lane_s32(a, b, c, 1);
}

-// CHECK-LABEL: @test_vmlsl_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
-// CHECK: ret <4 x i32> [[SUB]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB]]
+//
uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmlsl_lane_u16(a, b, c, 3);
}

-// CHECK-LABEL: @test_vmlsl_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
-// CHECK: ret <2 x i64> [[SUB]]
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB]]
+//
uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmlsl_lane_u32(a, b, c, 1);
}

-// CHECK-LABEL: @test_vmlsl_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vmlsl_n_s16(a, b, c);
}

-// CHECK-LABEL: @test_vmlsl_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vmlsl_n_s32(a, b, c);
}

-// CHECK-LABEL: @test_vmlsl_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
  return vmlsl_n_u16(a, b, c);
}

-// CHECK-LABEL: @test_vmlsl_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMULL2_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
  return vmlsl_n_u32(a, b, c);
}

-// CHECK-LABEL: @test_vmls_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x i16> [[SUB]]
+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x i16> [[SUB]]
+//
int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
  return vmls_lane_s16(a, b, c, 3);
}

-// CHECK-LABEL: @test_vmls_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
-// CHECK: ret <2 x i32> [[SUB]]
+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A]], [[MUL]]
+// CHECK-NEXT: ret <2 x i32> [[SUB]]
+//
int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
  return vmls_lane_s32(a, b, c, 1);
}

-// CHECK-LABEL: @test_vmls_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x i16> [[SUB]]
+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x i16> [[SUB]]
+//
uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
  return vmls_lane_u16(a, b, c, 3);
}

-// CHECK-LABEL: @test_vmls_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]]
-// CHECK: ret <2 x i32> [[SUB]]
+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A]], [[MUL]]
+// CHECK-NEXT: ret <2 x i32> [[SUB]]
+//
uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
  return vmls_lane_u32(a, b, c, 1);
}

-// CHECK-LABEL: @test_vmls_lane_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]]
-// CHECK: ret <2 x float> [[SUB]]
+// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]]
+// CHECK-NEXT: ret <2 x float> [[SUB]]
+//
float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
  return vmls_lane_f32(a, b, c, 1);
}

-// CHECK-LABEL: @test_vmlsq_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
-// CHECK: ret <8 x i16> [[SUB]]
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A]], [[MUL]]
+// CHECK-NEXT: ret <8 x i16> [[SUB]]
+//
int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
  return vmlsq_lane_s16(a, b, c, 3);
}

-// CHECK-LABEL: @test_vmlsq_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x i32> [[SUB]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x i32> [[SUB]]
+//
int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
  return vmlsq_lane_s32(a, b, c, 1);
}

-// CHECK-LABEL: @test_vmlsq_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]]
-// CHECK: ret <8 x i16> [[SUB]]
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A]], [[MUL]]
+// CHECK-NEXT: ret <8 x i16> [[SUB]]
+//
uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
  return vmlsq_lane_u16(a, b, c, 3);
}

-// CHECK-LABEL: @test_vmlsq_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x i32> [[SUB]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x i32> [[SUB]]
+//
uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
  return vmlsq_lane_u32(a, b, c, 1);
}

-// CHECK-LABEL: @test_vmlsq_lane_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
-// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]]
-// CHECK: ret <4 x float> [[SUB]]
+// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[C]], <2 x float> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
+// CHECK-NEXT: [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]]
+// CHECK-NEXT: ret <4 x float> [[SUB]]
+//
float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
  return vmlsq_lane_f32(a, b, c, 1);
}

-// CHECK-LABEL: @test_vmls_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
  return vmls_n_s16(a, b, c);
}

-// CHECK-LABEL: @test_vmls_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
 int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
   return vmls_n_s32(a, b, c);
 }
-// CHECK-LABEL: @test_vmls_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
 uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
   return vmls_n_u16(a, b, c);
 }
-// CHECK-LABEL: @test_vmls_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[B]], [[VECINIT1_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
 uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
   return vmls_n_u32(a, b, c);
 }
-// CHECK-LABEL: @test_vmls_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
-// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
-// CHECK: ret <2 x float> [[SUB_I]]
+// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[VECINIT1_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <2 x float> [[SUB_I]]
+//
 float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
   return vmls_n_f32(a, b, c);
 }
-// CHECK-LABEL: @test_vmlsq_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[VECINIT7_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
   return vmlsq_n_s16(a, b, c);
 }
-// CHECK-LABEL: @test_vmlsq_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
   return vmlsq_n_s32(a, b, c);
 }
-// CHECK-LABEL: @test_vmlsq_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]], i16 noundef zeroext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[B]], [[VECINIT7_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
   return vmlsq_n_u16(a, b, c);
 }
-// CHECK-LABEL: @test_vmlsq_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
   return vmlsq_n_u32(a, b, c);
 }
-// CHECK-LABEL: @test_vmlsq_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
-// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
-// CHECK: ret <4 x float> [[SUB_I]]
+// CHECK-LABEL: define <4 x float> @test_vmlsq_n_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT: ret <4 x float> [[SUB_I]]
+//
 float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
   return vmlsq_n_f32(a, b, c);
 }
-// CHECK-LABEL: @test_vmovl_s8(
-// CHECK: [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmovl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]]
+//
 int16x8_t test_vmovl_s8(int8x8_t a) {
   return vmovl_s8(a);
 }
-// CHECK-LABEL: @test_vmovl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I:%.*]] = sext <4 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmovl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]]
+//
 int32x4_t test_vmovl_s16(int16x4_t a) {
   return vmovl_s16(a);
 }
-// CHECK-LABEL: @test_vmovl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I:%.*]] = sext <2 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmovl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]]
+//
 int64x2_t test_vmovl_s32(int32x2_t a) {
   return vmovl_s32(a);
 }
-// CHECK-LABEL: @test_vmovl_u8(
-// CHECK: [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[VMOVL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmovl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[VMOVL_I]]
+//
 uint16x8_t test_vmovl_u8(uint8x8_t a) {
   return vmovl_u8(a);
 }
-// CHECK-LABEL: @test_vmovl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I:%.*]] = zext <4 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[VMOVL_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmovl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[VMOVL_I]]
+//
 uint32x4_t test_vmovl_u16(uint16x4_t a) {
   return vmovl_u16(a);
 }
-// CHECK-LABEL: @test_vmovl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I:%.*]] = zext <2 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[VMOVL_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmovl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[VMOVL_I]]
+//
 uint64x2_t test_vmovl_u32(uint32x2_t a) {
   return vmovl_u32(a);
 }
-// CHECK-LABEL: @test_vmovn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[VMOVN_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmovn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <8 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VMOVN_I]]
+//
 int8x8_t test_vmovn_s16(int16x8_t a) {
   return vmovn_s16(a);
 }
-// CHECK-LABEL: @test_vmovn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[VMOVN_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmovn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <4 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VMOVN_I]]
+//
 int16x4_t test_vmovn_s32(int32x4_t a) {
   return vmovn_s32(a);
 }
-// CHECK-LABEL: @test_vmovn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[VMOVN_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmovn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <2 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VMOVN_I]]
+//
 int32x2_t test_vmovn_s64(int64x2_t a) {
   return vmovn_s64(a);
 }
-// CHECK-LABEL: @test_vmovn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <8 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[VMOVN_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmovn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <8 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VMOVN_I]]
+//
 uint8x8_t test_vmovn_u16(uint16x8_t a) {
   return vmovn_u16(a);
 }
-// CHECK-LABEL: @test_vmovn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <4 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[VMOVN_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmovn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <4 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VMOVN_I]]
+//
 uint16x4_t test_vmovn_u32(uint32x4_t a) {
   return vmovn_u32(a);
 }
-// CHECK-LABEL: @test_vmovn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VMOVN_I:%.*]] = trunc <2 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[VMOVN_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmovn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVN_I:%.*]] = trunc <2 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VMOVN_I]]
+//
 uint32x2_t test_vmovn_u64(uint64x2_t a) {
   return vmovn_u64(a);
 }
-// CHECK-LABEL: @test_vmov_n_u8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmov_n_u8(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
+//
 uint8x8_t test_vmov_n_u8(uint8_t a) {
   return vmov_n_u8(a);
 }
-// CHECK-LABEL: @test_vmov_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmov_n_u16(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
+//
 uint16x4_t test_vmov_n_u16(uint16_t a) {
   return vmov_n_u16(a);
 }
-// CHECK-LABEL: @test_vmov_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: ret <2 x i32> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmov_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i32> [[VECINIT1_I]]
+//
 uint32x2_t test_vmov_n_u32(uint32_t a) {
   return vmov_n_u32(a);
 }
-// CHECK-LABEL: @test_vmov_n_s8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmov_n_s8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
+//
 int8x8_t test_vmov_n_s8(int8_t a) {
   return vmov_n_s8(a);
 }
-// CHECK-LABEL: @test_vmov_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmov_n_s16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
+//
 int16x4_t test_vmov_n_s16(int16_t a) {
   return vmov_n_s16(a);
 }
-// CHECK-LABEL: @test_vmov_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: ret <2 x i32> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmov_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i32> [[VECINIT1_I]]
+//
 int32x2_t test_vmov_n_s32(int32_t a) {
   return vmov_n_s32(a);
 }
-// CHECK-LABEL: @test_vmov_n_p8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmov_n_p8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]]
+//
 poly8x8_t test_vmov_n_p8(poly8_t a) {
   return vmov_n_p8(a);
 }
-// CHECK-LABEL: @test_vmov_n_p16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmov_n_p16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i16> [[VECINIT3_I]]
+//
 poly16x4_t test_vmov_n_p16(poly16_t a) {
   return vmov_n_p16(a);
 }
-// CHECK-LABEL: @test_vmov_n_f16(
-// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
-// CHECK: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i32 0
-// CHECK: [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
-// CHECK: [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
-// CHECK: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
-// CHECK: ret <4 x half> [[VECINIT3]]
+// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x half> poison, half [[TMP0]], i64 0
+// CHECK-NEXT: [[VECINIT3:%.*]] = shufflevector <4 x half> [[VECINIT]], <4 x half> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x half> [[VECINIT3]]
+//
 float16x4_t test_vmov_n_f16(float16_t *a) {
   return vmov_n_f16(*a);
 }
-// CHECK-LABEL: @test_vmov_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
-// CHECK: ret <2 x float> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x float> @test_vmov_n_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x float> [[VECINIT1_I]]
+//
 float32x2_t test_vmov_n_f32(float32_t a) {
   return vmov_n_f32(a);
 }
-// CHECK-LABEL: @test_vmovq_n_u8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
-// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
-// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
-// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
-// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
-// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
-// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
-// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VECINIT15_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmovq_n_u8(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
+//
 uint8x16_t test_vmovq_n_u8(uint8_t a) {
   return vmovq_n_u8(a);
 }
-// CHECK-LABEL: @test_vmovq_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmovq_n_u16(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
+//
 uint16x8_t test_vmovq_n_u16(uint16_t a) {
   return vmovq_n_u16(a);
 }
-// CHECK-LABEL: @test_vmovq_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
-// CHECK: ret <4 x i32> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmovq_n_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]]
+//
 uint32x4_t test_vmovq_n_u32(uint32_t a) {
   return vmovq_n_u32(a);
 }
-// CHECK-LABEL: @test_vmovq_n_s8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
-// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
-// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
-// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
-// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
-// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
-// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
-// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VECINIT15_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmovq_n_s8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
+//
 int8x16_t test_vmovq_n_s8(int8_t a) {
   return vmovq_n_s8(a);
 }
-// CHECK-LABEL: @test_vmovq_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmovq_n_s16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
+//
 int16x8_t test_vmovq_n_s16(int16_t a) {
   return vmovq_n_s16(a);
 }
-// CHECK-LABEL: @test_vmovq_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
-// CHECK: ret <4 x i32> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmovq_n_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x i32> [[VECINIT3_I]]
+//
 int32x4_t test_vmovq_n_s32(int32_t a) {
   return vmovq_n_s32(a);
 }
-// CHECK-LABEL: @test_vmovq_n_p8(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
-// CHECK: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
-// CHECK: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
-// CHECK: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
-// CHECK: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
-// CHECK: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
-// CHECK: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
-// CHECK: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
-// CHECK: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VECINIT15_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmovq_n_p8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT15_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]]
+//
 poly8x16_t test_vmovq_n_p8(poly8_t a) {
   return vmovq_n_p8(a);
 }
-// CHECK-LABEL: @test_vmovq_n_p16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VECINIT7_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmovq_n_p16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x i16> [[VECINIT7_I]]
+//
 poly16x8_t test_vmovq_n_p16(poly16_t a) {
   return vmovq_n_p16(a);
 }
-// CHECK-LABEL: @test_vmovq_n_f16(
-// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
-// CHECK: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i32 0
-// CHECK: [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
-// CHECK: [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
-// CHECK: [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
-// CHECK: [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
-// CHECK: [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
-// CHECK: [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
-// CHECK: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
-// CHECK: ret <8 x half> [[VECINIT7]]
+// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A]], align 2
+// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <8 x half> poison, half [[TMP0]], i64 0
+// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <8 x half> [[VECINIT]], <8 x half> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: ret <8 x half> [[VECINIT7]]
+//
 float16x8_t test_vmovq_n_f16(float16_t *a) {
   return vmovq_n_f16(*a);
 }
-// CHECK-LABEL: @test_vmovq_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
-// CHECK: ret <4 x float> [[VECINIT3_I]]
+// CHECK-LABEL: define <4 x float> @test_vmovq_n_f32(
+// CHECK-SAME: float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: ret <4 x float> [[VECINIT3_I]]
+//
 float32x4_t test_vmovq_n_f32(float32_t a) {
   return vmovq_n_f32(a);
 }
-// CHECK-LABEL: @test_vmov_n_s64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vmov_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[ADD_I:%.*]] = shl <1 x i64> [[VECINIT_I]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
 int64x1_t test_vmov_n_s64(int64_t a) {
   int64x1_t tmp = vmov_n_s64(a);
   return vadd_s64(tmp, tmp);
 }
-// CHECK-LABEL: @test_vmov_n_u64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 %a, i32 0
-// CHECK: [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
-// CHECK: ret <1 x i64> [[ADD_I]]
+// CHECK-LABEL: define <1 x i64> @test_vmov_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[ADD_I:%.*]] = shl <1 x i64> [[VECINIT_I]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[ADD_I]]
+//
 uint64x1_t test_vmov_n_u64(uint64_t a) {
   uint64x1_t tmp = vmov_n_u64(a);
   return vadd_u64(tmp, tmp);
 }
-// CHECK-LABEL: @test_vmovq_n_s64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
-// CHECK: ret <2 x i64> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmovq_n_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i64> [[VECINIT_I]], <2 x i64> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[VECINIT1_I]]
+//
 int64x2_t test_vmovq_n_s64(int64_t a) {
   return vmovq_n_s64(a);
 }
-// CHECK-LABEL: @test_vmovq_n_u64(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 %a, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
-// CHECK: ret <2 x i64> [[VECINIT1_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmovq_n_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i64> [[VECINIT_I]], <2 x i64> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: ret <2 x i64> [[VECINIT1_I]]
+//
 uint64x2_t test_vmovq_n_u64(uint64_t a) {
   return vmovq_n_u64(a);
 }
-// CHECK-LABEL: @test_vmul_s8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[MUL_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmul_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[MUL_I]]
+//
 int8x8_t test_vmul_s8(int8x8_t a, int8x8_t b) {
   return vmul_s8(a, b);
 }
-// CHECK-LABEL: @test_vmul_s16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[MUL_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmul_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
+//
 int16x4_t test_vmul_s16(int16x4_t a, int16x4_t b) {
   return vmul_s16(a, b);
 }
-// CHECK-LABEL: @test_vmul_s32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[MUL_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmul_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
+//
 int32x2_t test_vmul_s32(int32x2_t a, int32x2_t b) {
   return vmul_s32(a, b);
 }
-// CHECK-LABEL: @test_vmul_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, %b
-// CHECK: ret <2 x float> [[MUL_I]]
+// CHECK-LABEL: define <2 x float> @test_vmul_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x float> [[MUL_I]]
+//
 float32x2_t test_vmul_f32(float32x2_t a, float32x2_t b) {
   return vmul_f32(a, b);
 }
-// CHECK-LABEL: @test_vmul_u8(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[MUL_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmul_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[MUL_I]]
+//
 uint8x8_t test_vmul_u8(uint8x8_t a, uint8x8_t b) {
   return vmul_u8(a, b);
 }
-// CHECK-LABEL: @test_vmul_u16(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[MUL_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmul_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
+//
 uint16x4_t test_vmul_u16(uint16x4_t a, uint16x4_t b) {
   return vmul_u16(a, b);
 }
-// CHECK-LABEL: @test_vmul_u32(
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[MUL_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmul_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
+//
 uint32x2_t test_vmul_u32(uint32x2_t a, uint32x2_t b) {
   return vmul_u32(a, b);
 }
-// CHECK-LABEL: @test_vmulq_s8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[MUL_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmulq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[MUL_I]]
+//
 int8x16_t test_vmulq_s8(int8x16_t a, int8x16_t b) {
   return vmulq_s8(a, b);
 }
-// CHECK-LABEL: @test_vmulq_s16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[MUL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
+//
 int16x8_t test_vmulq_s16(int16x8_t a, int16x8_t b) {
   return vmulq_s16(a, b);
 }
-// CHECK-LABEL: @test_vmulq_s32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[MUL_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
+//
 int32x4_t test_vmulq_s32(int32x4_t a, int32x4_t b) {
   return vmulq_s32(a, b);
 }
-// CHECK-LABEL: @test_vmulq_f32(
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, %b
-// CHECK: ret <4 x float> [[MUL_I]]
+// CHECK-LABEL: define <4 x float> @test_vmulq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x float> [[MUL_I]]
+//
 float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b) {
   return vmulq_f32(a, b);
 }
-// CHECK-LABEL: @test_vmulq_u8(
-// CHECK: [[MUL_I:%.*]] = mul <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[MUL_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmulq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[MUL_I]]
+//
 uint8x16_t test_vmulq_u8(uint8x16_t a, uint8x16_t b) {
   return vmulq_u8(a, b);
 }
-// CHECK-LABEL: @test_vmulq_u16(
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[MUL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
+//
 uint16x8_t test_vmulq_u16(uint16x8_t a, uint16x8_t b) {
   return vmulq_u16(a, b);
 }
-// CHECK-LABEL: @test_vmulq_u32(
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[MUL_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
+//
 uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b) {
   return vmulq_u32(a, b);
 }
-// CHECK-LABEL: @test_vmull_s8(
-// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i16> [[VMULL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmull_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I]]
+//
 int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
   return vmull_s8(a, b);
 }
-// CHECK-LABEL: @test_vmull_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i32> [[VMULL2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+//
 int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {
   return vmull_s16(a, b);
 }
-// CHECK-LABEL: @test_vmull_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i64> [[VMULL2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+//
 int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) {
   return vmull_s32(a, b);
 }
-// CHECK-LABEL: @test_vmull_u8(
-// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i16> [[VMULL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmull_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I]]
+//
 uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {
   return vmull_u8(a, b);
 }
-// CHECK-LABEL: @test_vmull_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> %b)
-// CHECK: ret <4 x i32> [[VMULL2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+//
 uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {
   return vmull_u16(a, b);
 }
-// CHECK-LABEL: @test_vmull_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> %b)
-// CHECK: ret <2 x i64> [[VMULL2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+//
 uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {
   return vmull_u32(a, b);
 }
-// CHECK-LABEL: @test_vmull_p8(
-// CHECK: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i16> [[VMULL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmull_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VMULL_I]]
+//
 poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {
   return vmull_p8(a, b);
 }
-// CHECK-LABEL: @test_vmull_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK: ret <4 x i32> [[VMULL2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+//
 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) {
   return vmull_lane_s16(a, b, 3);
 }
-// CHECK-LABEL: @test_vmull_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK: ret <2 x i64> [[VMULL2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+//
 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) {
   return vmull_lane_s32(a, b, 1);
 }
-// CHECK-LABEL: @test_vmull_lane_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK: ret <4 x i32> [[VMULL2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
+//
 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) {
   return vmull_lane_u16(a, b, 3);
 }
-// CHECK-LABEL: @test_vmull_lane_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK: ret <2 x i64> [[VMULL2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
+//
 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) {
   return vmull_lane_u32(a, b, 1);
 }
-// CHECK-LABEL: @test_vmull_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
-// CHECK: ret <4 x i32> [[VMULL5_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
+//
 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
   return vmull_n_s16(a, b);
 }
-// CHECK-LABEL: @test_vmull_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
-// CHECK: ret <2 x i64> [[VMULL3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
+//
 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
   return vmull_n_s32(a, b);
 }
-// CHECK-LABEL: @test_vmull_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
-// CHECK: ret <4 x i32> [[VMULL5_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
+//
 uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
   return vmull_n_u16(a, b);
 }
-// CHECK-LABEL: @test_vmull_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
-// CHECK: ret <2 x i64> [[VMULL3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
+//
 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
   return vmull_n_u32(a, b);
 }
-// CHECK-LABEL: @test_vmul_p8(
-// CHECK: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VMUL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmul_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VMUL_V_I]]
+//
 poly8x8_t test_vmul_p8(poly8x8_t a, poly8x8_t b) {
   return vmul_p8(a, b);
 }
-// CHECK-LABEL: @test_vmulq_p8(
-// CHECK: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VMULQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmulq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VMULQ_V_I]]
+//
 poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) {
   return vmulq_p8(a, b);
 }
-// CHECK-LABEL: @test_vmul_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]]
-// CHECK: ret <4 x i16> [[MUL]]
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A]], [[LANE]]
+// CHECK-NEXT: ret <4 x i16> [[MUL]]
+//
 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) {
   return vmul_lane_s16(a, b, 3);
 }
-// CHECK-LABEL: @test_vmul_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]]
-// CHECK: ret <2 x i32> [[MUL]]
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(
+// CHECK-SAME: <2 x i32>
noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A]], [[LANE]] +// CHECK-NEXT: ret <2 x i32> [[MUL]] +// int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) { return vmul_lane_s32(a, b, 1); } -// CHECK-LABEL: @test_vmul_lane_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]] -// CHECK: ret <2 x float> [[MUL]] +// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[B]], <2 x float> poison, <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A]], [[LANE]] +// CHECK-NEXT: ret <2 x float> [[MUL]] +// float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) { return vmul_lane_f32(a, b, 1); } -// CHECK-LABEL: @test_vmul_lane_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] -// CHECK: ret <4 x i16> [[MUL]] +// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A]], [[LANE]] +// CHECK-NEXT: ret <4 x i16> [[MUL]] +// uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) { return vmul_lane_u16(a, b, 3); } -// CHECK-LABEL: @test_vmul_lane_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] -// CHECK: ret <2 x i32> [[MUL]] +// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A]], [[LANE]] +// CHECK-NEXT: ret <2 x i32> [[MUL]] +// uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) { return vmul_lane_u32(a, b, 1); } -// CHECK-LABEL: @test_vmulq_lane_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] -// CHECK: ret <8 x i16> [[MUL]] +// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A]], [[LANE]] +// CHECK-NEXT: ret <8 x i16> [[MUL]] +// int16x8_t 
test_vmulq_lane_s16(int16x8_t a, int16x4_t b) { return vmulq_lane_s16(a, b, 3); } -// CHECK-LABEL: @test_vmulq_lane_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] -// CHECK: ret <4 x i32> [[MUL]] +// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A]], [[LANE]] +// CHECK-NEXT: ret <4 x i32> [[MUL]] +// int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) { return vmulq_lane_s32(a, b, 1); } -// CHECK-LABEL: @test_vmulq_lane_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> -// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]] -// CHECK: ret <4 x float> [[MUL]] +// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[B]], <2 x float> poison, <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A]], [[LANE]] +// CHECK-NEXT: ret <4 x float> [[MUL]] +// float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) { return vmulq_lane_f32(a, b, 1); } -// CHECK-LABEL: @test_vmulq_lane_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] -// CHECK: ret <8 x i16> [[MUL]] +// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A]], [[LANE]] +// CHECK-NEXT: ret <8 x i16> [[MUL]] +// uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) { return vmulq_lane_u16(a, b, 3); } -// CHECK-LABEL: @test_vmulq_lane_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] -// CHECK: ret <4 x i32> [[MUL]] +// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A]], [[LANE]] +// CHECK-NEXT: ret <4 x i32> [[MUL]] +// uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) { return vmulq_lane_u32(a, b, 1); } -// CHECK-LABEL: @test_vmul_n_s16( -// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0 -// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1 
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
-// CHECK: ret <4 x i16> [[MUL_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A]], [[VECINIT3_I]]
+// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
+//
int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
  return vmul_n_s16(a, b);
}

-// CHECK-LABEL: @test_vmul_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
-// CHECK: ret <2 x i32> [[MUL_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A]], [[VECINIT1_I]]
+// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
+//
int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
  return vmul_n_s32(a, b);
}

-// CHECK-LABEL: @test_vmul_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
-// CHECK: [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
-// CHECK: ret <2 x float> [[MUL_I]]
+// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x float> poison, float [[B]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x float> [[VECINIT_I]], <2 x float> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <2 x float> [[A]], [[VECINIT1_I]]
+// CHECK-NEXT: ret <2 x float> [[MUL_I]]
+//
float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
  return vmul_n_f32(a, b);
}

-// CHECK-LABEL: @test_vmul_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
-// CHECK: ret <4 x i16> [[MUL_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i16> [[A]], [[VECINIT3_I]]
+// CHECK-NEXT: ret <4 x i16> [[MUL_I]]
+//
uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
  return vmul_n_u16(a, b);
}

-// CHECK-LABEL: @test_vmul_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
-// CHECK: ret <2 x i32> [[MUL_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <2 x i32> [[A]], [[VECINIT1_I]]
+// CHECK-NEXT: ret <2 x i32> [[MUL_I]]
+//
uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
  return vmul_n_u32(a, b);
}

-// CHECK-LABEL: @test_vmulq_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
-// CHECK: ret <8 x i16> [[MUL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A]], [[VECINIT7_I]]
+// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
+//
int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
  return vmulq_n_s16(a, b);
}

-// CHECK-LABEL: @test_vmulq_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
-// CHECK: ret <4 x i32> [[MUL_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A]], [[VECINIT3_I]]
+// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
+//
int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
  return vmulq_n_s32(a, b);
}

-// CHECK-LABEL: @test_vmulq_n_f32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
-// CHECK: [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
-// CHECK: ret <4 x float> [[MUL_I]]
+// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x float> [[VECINIT_I]], <4 x float> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = fmul <4 x float> [[A]], [[VECINIT3_I]]
+// CHECK-NEXT: ret <4 x float> [[MUL_I]]
+//
float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
  return vmulq_n_f32(a, b);
}

-// CHECK-LABEL: @test_vmulq_n_u16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
-// CHECK: [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
-// CHECK: ret <8 x i16> [[MUL_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <8 x i16> [[A]], [[VECINIT7_I]]
+// CHECK-NEXT: ret <8 x i16> [[MUL_I]]
+//
uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
  return vmulq_n_u16(a, b);
}

-// CHECK-LABEL: @test_vmulq_n_u32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
-// CHECK: [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
-// CHECK: ret <4 x i32> [[MUL_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[MUL_I:%.*]] = mul <4 x i32> [[A]], [[VECINIT3_I]]
+// CHECK-NEXT: ret <4 x i32> [[MUL_I]]
+//
uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
  return vmulq_n_u32(a, b);
}

-// CHECK-LABEL: @test_vmvn_s8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1)
-// CHECK: ret <8 x i8> [[NEG_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmvn_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <8 x i8> [[NOT_I]]
+//
int8x8_t test_vmvn_s8(int8x8_t a) {
  return vmvn_s8(a);
}

-// CHECK-LABEL: @test_vmvn_s16(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, splat (i16 -1)
-// CHECK: ret <4 x i16> [[NEG_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmvn_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[A]], splat (i16 -1)
+// CHECK-NEXT: ret <4 x i16> [[NOT_I]]
+//
int16x4_t test_vmvn_s16(int16x4_t a) {
  return vmvn_s16(a);
}

-// CHECK-LABEL: @test_vmvn_s32(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, splat (i32 -1)
-// CHECK: ret <2 x i32> [[NEG_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmvn_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[A]], splat (i32 -1)
+// CHECK-NEXT: ret <2 x i32> [[NOT_I]]
+//
int32x2_t test_vmvn_s32(int32x2_t a) {
  return vmvn_s32(a);
}

-// CHECK-LABEL: @test_vmvn_u8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1)
-// CHECK: ret <8 x i8> [[NEG_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmvn_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <8 x i8> [[NOT_I]]
+//
uint8x8_t test_vmvn_u8(uint8x8_t a) {
  return vmvn_u8(a);
}

-// CHECK-LABEL: @test_vmvn_u16(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %a, splat (i16 -1)
-// CHECK: ret <4 x i16> [[NEG_I]]
+// CHECK-LABEL: define <4 x i16> @test_vmvn_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[A]], splat (i16 -1)
+// CHECK-NEXT: ret <4 x i16> [[NOT_I]]
+//
uint16x4_t test_vmvn_u16(uint16x4_t a) {
  return vmvn_u16(a);
}

-// CHECK-LABEL: @test_vmvn_u32(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %a, splat (i32 -1)
-// CHECK: ret <2 x i32> [[NEG_I]]
+// CHECK-LABEL: define <2 x i32> @test_vmvn_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[A]], splat (i32 -1)
+// CHECK-NEXT: ret <2 x i32> [[NOT_I]]
+//
uint32x2_t test_vmvn_u32(uint32x2_t a) {
  return vmvn_u32(a);
}

-// CHECK-LABEL: @test_vmvn_p8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %a, splat (i8 -1)
-// CHECK: ret <8 x i8> [[NEG_I]]
+// CHECK-LABEL: define <8 x i8> @test_vmvn_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <8 x i8> [[NOT_I]]
+//
poly8x8_t test_vmvn_p8(poly8x8_t a) {
  return vmvn_p8(a);
}

-// CHECK-LABEL: @test_vmvnq_s8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1)
-// CHECK: ret <16 x i8> [[NEG_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <16 x i8> [[NOT_I]]
+//
int8x16_t test_vmvnq_s8(int8x16_t a) {
  return vmvnq_s8(a);
}

-// CHECK-LABEL: @test_vmvnq_s16(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, splat (i16 -1)
-// CHECK: ret <8 x i16> [[NEG_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmvnq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[A]], splat (i16 -1)
+// CHECK-NEXT: ret <8 x i16> [[NOT_I]]
+//
int16x8_t test_vmvnq_s16(int16x8_t a) {
  return vmvnq_s16(a);
}

-// CHECK-LABEL: @test_vmvnq_s32(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, splat (i32 -1)
-// CHECK: ret <4 x i32> [[NEG_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmvnq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[A]], splat (i32 -1)
+// CHECK-NEXT: ret <4 x i32> [[NOT_I]]
+//
int32x4_t test_vmvnq_s32(int32x4_t a) {
  return vmvnq_s32(a);
}

-// CHECK-LABEL: @test_vmvnq_u8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1)
-// CHECK: ret <16 x i8> [[NEG_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <16 x i8> [[NOT_I]]
+//
uint8x16_t test_vmvnq_u8(uint8x16_t a) {
  return vmvnq_u8(a);
}

-// CHECK-LABEL: @test_vmvnq_u16(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %a, splat (i16 -1)
-// CHECK: ret <8 x i16> [[NEG_I]]
+// CHECK-LABEL: define <8 x i16> @test_vmvnq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[A]], splat (i16 -1)
+// CHECK-NEXT: ret <8 x i16> [[NOT_I]]
+//
uint16x8_t test_vmvnq_u16(uint16x8_t a) {
  return vmvnq_u16(a);
}

-// CHECK-LABEL: @test_vmvnq_u32(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %a, splat (i32 -1)
-// CHECK: ret <4 x i32> [[NEG_I]]
+// CHECK-LABEL: define <4 x i32> @test_vmvnq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[A]], splat (i32 -1)
+// CHECK-NEXT: ret <4 x i32> [[NOT_I]]
+//
uint32x4_t test_vmvnq_u32(uint32x4_t a) {
  return vmvnq_u32(a);
}

-// CHECK-LABEL: @test_vmvnq_p8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %a, splat (i8 -1)
-// CHECK: ret <16 x i8> [[NEG_I]]
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[A]], splat (i8 -1)
+// CHECK-NEXT: ret <16 x i8> [[NOT_I]]
+//
poly8x16_t test_vmvnq_p8(poly8x16_t a) {
  return vmvnq_p8(a);
}

-// CHECK-LABEL: @test_vneg_s8(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define <8 x i8> @test_vneg_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
int8x8_t test_vneg_s8(int8x8_t a) {
  return vneg_s8(a);
}

-// CHECK-LABEL: @test_vneg_s16(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define <4 x i16> @test_vneg_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
int16x4_t test_vneg_s16(int16x4_t a) {
  return vneg_s16(a);
}

-// CHECK-LABEL: @test_vneg_s32(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define <2 x i32> @test_vneg_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
int32x2_t test_vneg_s32(int32x2_t a) {
  return vneg_s32(a);
}

-// CHECK-LABEL: @test_vneg_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <2 x float> %a
-// CHECK: ret <2 x float> [[SUB_I]]
+// CHECK-LABEL: define <2 x float> @test_vneg_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <2 x float> [[A]]
+// CHECK-NEXT: ret <2 x float> [[FNEG_I]]
+//
float32x2_t test_vneg_f32(float32x2_t a) {
  return vneg_f32(a);
}

-// CHECK-LABEL: @test_vnegq_s8(
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define <16 x i8> @test_vnegq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
int8x16_t test_vnegq_s8(int8x16_t a) {
  return vnegq_s8(a);
}

-// CHECK-LABEL: @test_vnegq_s16(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vnegq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
int16x8_t test_vnegq_s16(int16x8_t a) {
  return vnegq_s16(a);
}

-// CHECK-LABEL: @test_vnegq_s32(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vnegq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, [[A]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
int32x4_t test_vnegq_s32(int32x4_t a) {
  return vnegq_s32(a);
}

-// CHECK-LABEL: @test_vnegq_f32(
-// CHECK: [[SUB_I:%.*]] = fneg <4 x float> %a
-// CHECK: ret <4 x float> [[SUB_I]]
+// CHECK-LABEL: define <4 x float> @test_vnegq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[FNEG_I:%.*]] = fneg <4 x float> [[A]]
+// CHECK-NEXT: ret <4 x float> [[FNEG_I]]
+//
float32x4_t test_vnegq_f32(float32x4_t a) {
  return vnegq_f32(a);
}

-// CHECK-LABEL: @test_vorn_s8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, splat (i8 -1)
-// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
-// CHECK: ret <8 x i8> [[OR_I]]
+// CHECK-LABEL: define <8 x i8> @test_vorn_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i8> [[OR_I]]
+//
int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) {
  return vorn_s8(a, b);
}

-// CHECK-LABEL: @test_vorn_s16(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, splat (i16 -1)
-// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
-// CHECK: ret <4 x i16> [[OR_I]]
+// CHECK-LABEL: define <4 x i16> @test_vorn_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i16> [[OR_I]]
+//
int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) {
  return vorn_s16(a, b);
}

-// CHECK-LABEL: @test_vorn_s32(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, splat (i32 -1)
-// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
-// CHECK: ret <2 x i32> [[OR_I]]
+// CHECK-LABEL: define <2 x i32> @test_vorn_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i32> [[OR_I]]
+//
int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) {
  return vorn_s32(a, b);
}

-// CHECK-LABEL: @test_vorn_s64(
-// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, splat (i64 -1)
-// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
-// CHECK: ret <1 x i64> [[OR_I]]
+// CHECK-LABEL: define <1 x i64> @test_vorn_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <1 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <1 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <1 x i64> [[OR_I]]
+//
int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) {
  return vorn_s64(a, b);
}

-// CHECK-LABEL: @test_vorn_u8(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i8> %b, splat (i8 -1)
-// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
-// CHECK: ret <8 x i8> [[OR_I]]
+// CHECK-LABEL: define <8 x i8> @test_vorn_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i8> [[OR_I]]
+//
uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) {
  return vorn_u8(a, b);
}

-// CHECK-LABEL: @test_vorn_u16(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i16> %b, splat (i16 -1)
-// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
-// CHECK: ret <4 x i16> [[OR_I]]
+// CHECK-LABEL: define <4 x i16> @test_vorn_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i16> [[OR_I]]
+//
uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) {
  return vorn_u16(a, b);
}

-// CHECK-LABEL: @test_vorn_u32(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i32> %b, splat (i32 -1)
-// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
-// CHECK: ret <2 x i32> [[OR_I]]
+// CHECK-LABEL: define <2 x i32> @test_vorn_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i32> [[OR_I]]
+//
uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) {
  return vorn_u32(a, b);
}

-// CHECK-LABEL: @test_vorn_u64(
-// CHECK: [[NEG_I:%.*]] = xor <1 x i64> %b, splat (i64 -1)
-// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
-// CHECK: ret <1 x i64> [[OR_I]]
+// CHECK-LABEL: define <1 x i64> @test_vorn_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <1 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <1 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <1 x i64> [[OR_I]]
+//
uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) {
  return vorn_u64(a, b);
}

-// CHECK-LABEL: @test_vornq_s8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, splat (i8 -1)
-// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
-// CHECK: ret <16 x i8> [[OR_I]]
+// CHECK-LABEL: define <16 x i8> @test_vornq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <16 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <16 x i8> [[OR_I]]
+//
int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) {
  return vornq_s8(a, b);
}

-// CHECK-LABEL: @test_vornq_s16(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, splat (i16 -1)
-// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
-// CHECK: ret <8 x i16> [[OR_I]]
+// CHECK-LABEL: define <8 x i16> @test_vornq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i16> [[OR_I]]
+//
int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) {
  return vornq_s16(a, b);
}

-// CHECK-LABEL: @test_vornq_s32(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, splat (i32 -1)
-// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
-// CHECK: ret <4 x i32> [[OR_I]]
+// CHECK-LABEL: define <4 x i32> @test_vornq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i32> [[OR_I]]
+//
int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) {
  return vornq_s32(a, b);
}

-// CHECK-LABEL: @test_vornq_s64(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, splat (i64 -1)
-// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
-// CHECK: ret <2 x i64> [[OR_I]]
+// CHECK-LABEL: define <2 x i64> @test_vornq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i64> [[OR_I]]
+//
int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) {
  return vornq_s64(a, b);
}

-// CHECK-LABEL: @test_vornq_u8(
-// CHECK: [[NEG_I:%.*]] = xor <16 x i8> %b, splat (i8 -1)
-// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
-// CHECK: ret <16 x i8> [[OR_I]]
+// CHECK-LABEL: define <16 x i8> @test_vornq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <16 x i8> [[B]], splat (i8 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <16 x i8> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <16 x i8> [[OR_I]]
+//
uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) {
  return vornq_u8(a, b);
}

-// CHECK-LABEL: @test_vornq_u16(
-// CHECK: [[NEG_I:%.*]] = xor <8 x i16> %b, splat (i16 -1)
-// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
-// CHECK: ret <8 x i16> [[OR_I]]
+// CHECK-LABEL: define <8 x i16> @test_vornq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <8 x i16> [[B]], splat (i16 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i16> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <8 x i16> [[OR_I]]
+//
uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) {
  return vornq_u16(a, b);
}

-// CHECK-LABEL: @test_vornq_u32(
-// CHECK: [[NEG_I:%.*]] = xor <4 x i32> %b, splat (i32 -1)
-// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
-// CHECK: ret <4 x i32> [[OR_I]]
+// CHECK-LABEL: define <4 x i32> @test_vornq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <4 x i32> [[B]], splat (i32 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i32> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <4 x i32> [[OR_I]]
+//
uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) {
  return vornq_u32(a, b);
}

-// CHECK-LABEL: @test_vornq_u64(
-// CHECK: [[NEG_I:%.*]] = xor <2 x i64> %b, splat (i64 -1)
-// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
-// CHECK: ret <2 x i64> [[OR_I]]
+// CHECK-LABEL: define <2 x i64> @test_vornq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[NOT_I:%.*]] = xor <2 x i64> [[B]], splat (i64 -1)
+// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i64> [[A]], [[NOT_I]]
+// CHECK-NEXT: ret <2 x i64> [[OR_I]]
+//
uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) {
  return vornq_u64(a, b);
}

-// CHECK-LABEL: @test_vorr_s8(
-// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[OR_I]]
+// CHECK-LABEL: define <8 x i8> @test_vorr_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[OR_I]]
+//
int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) {
  return vorr_s8(a, b);
}

-// CHECK-LABEL: @test_vorr_s16(
-// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[OR_I]]
+// CHECK-LABEL: define <4 x i16> @test_vorr_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[OR_I]]
+//
int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) {
  return vorr_s16(a, b);
}

-// CHECK-LABEL: @test_vorr_s32(
-// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[OR_I]]
+// CHECK-LABEL: define <2 x i32> @test_vorr_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[OR_I]]
+//
int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) {
  return vorr_s32(a, b);
}

-// CHECK-LABEL: @test_vorr_s64(
-// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[OR_I]]
+// CHECK-LABEL: define <1 x i64> @test_vorr_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[OR_I]]
+//
int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) {
  return vorr_s64(a, b);
}

-// CHECK-LABEL: @test_vorr_u8(
-// CHECK: [[OR_I:%.*]] = or <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[OR_I]]
+// CHECK-LABEL: define <8 x i8> @test_vorr_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[OR_I]]
+//
uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) {
  return vorr_u8(a, b);
}

-// CHECK-LABEL: @test_vorr_u16(
-// CHECK: [[OR_I:%.*]] = or <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[OR_I]]
+// CHECK-LABEL: define <4 x i16> @test_vorr_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[OR_I]]
+//
uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) {
  return vorr_u16(a, b);
}

-// CHECK-LABEL: @test_vorr_u32(
-// CHECK: [[OR_I:%.*]] = or <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[OR_I]]
+// CHECK-LABEL: define <2 x i32> @test_vorr_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[OR_I]]
+//
uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) {
  return vorr_u32(a, b);
}

-// CHECK-LABEL: @test_vorr_u64(
-// CHECK: [[OR_I:%.*]] = or <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[OR_I]]
+// CHECK-LABEL: define <1 x i64> @test_vorr_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[OR_I]]
+//
uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) {
  return vorr_u64(a, b);
}

-// CHECK-LABEL: @test_vorrq_s8(
-// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[OR_I]]
+// CHECK-LABEL: define <16 x i8> @test_vorrq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[OR_I]]
+//
int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) {
  return vorrq_s8(a, b);
}

-// CHECK-LABEL: @test_vorrq_s16(
-// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[OR_I]]
+// CHECK-LABEL: define <8 x i16> @test_vorrq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[OR_I]]
+//
int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) {
  return vorrq_s16(a, b);
}

-// CHECK-LABEL: @test_vorrq_s32(
-// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[OR_I]]
+// CHECK-LABEL: define <4 x i32> @test_vorrq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[OR_I]]
+//
int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) {
  return vorrq_s32(a, b);
}

-// CHECK-LABEL: @test_vorrq_s64(
-// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[OR_I]]
+// CHECK-LABEL: define <2 x i64> @test_vorrq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[OR_I]]
+//
int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) {
  return vorrq_s64(a, b);
}

-// CHECK-LABEL: @test_vorrq_u8(
-// CHECK: [[OR_I:%.*]] = or <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[OR_I]]
+// CHECK-LABEL: define <16 x i8> @test_vorrq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[OR_I]]
+//
uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) {
  return vorrq_u8(a, b);
}

-// CHECK-LABEL: @test_vorrq_u16(
-// CHECK: [[OR_I:%.*]] = or <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[OR_I]]
+// CHECK-LABEL: define <8 x i16> @test_vorrq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[OR_I]]
+//
uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) {
  return vorrq_u16(a, b);
}

-// CHECK-LABEL: @test_vorrq_u32(
-// CHECK: [[OR_I:%.*]] = or <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[OR_I]]
+// CHECK-LABEL: define <4 x i32> @test_vorrq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[OR_I]]
+//
uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) {
  return vorrq_u32(a, b);
}

-// CHECK-LABEL: @test_vorrq_u64(
-// CHECK: [[OR_I:%.*]] = or <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[OR_I]]
+// CHECK-LABEL: define <2 x i64> @test_vorrq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[OR_I:%.*]] = or <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[OR_I]]
+//
uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) {
  return vorrq_u64(a, b);
}

-// CHECK-LABEL: @test_vpadal_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b)
-// CHECK: ret <4 x i16> [[VPADAL_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vpadal_s8(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VPADAL_V1_I]]
+//
int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
  return vpadal_s8(a, b);
}

-// CHECK-LABEL: @test_vpadal_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b)
-// CHECK: ret <2 x i32> [[VPADAL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vpadal_s16(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VPADAL_V2_I]]
+//
int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
  return vpadal_s16(a, b);
}

-// CHECK-LABEL: @test_vpadal_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b)
-// CHECK: ret <1 x i64> [[VPADAL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vpadal_s32(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VPADAL_V2_I]]
+//
int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
  return vpadal_s32(a, b);
}

-// CHECK-LABEL: @test_vpadal_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> %a, <8 x i8> %b)
-// CHECK: ret <4 x i16> [[VPADAL_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vpadal_u8(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VPADAL_V1_I]]
+//
uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
  return vpadal_u8(a, b);
}

-// CHECK-LABEL: @test_vpadal_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> %a, <4 x i16> %b)
-// CHECK: ret <2 x i32> [[VPADAL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vpadal_u16(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VPADAL_V2_I]]
+//
uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
  return vpadal_u16(a, b);
}

-// CHECK-LABEL: @test_vpadal_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> %a, <2 x i32> %b)
-// CHECK: ret <1 x i64> [[VPADAL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vpadal_u32(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VPADAL_V2_I]]
+//
uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
  return vpadal_u32(a, b);
}

-// CHECK-LABEL: @test_vpadalq_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b)
-// CHECK: ret <8 x i16> [[VPADALQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VPADALQ_V1_I]]
+//
int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
  return vpadalq_s8(a, b);
}

-// CHECK-LABEL: @test_vpadalq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b)
-// CHECK: ret <4 x i32> [[VPADALQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VPADALQ_V2_I]]
+//
int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
  return vpadalq_s16(a, b);
}

-// CHECK-LABEL: @test_vpadalq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b)
-// CHECK: ret <2 x i64> [[VPADALQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VPADALQ_V2_I]]
+//
int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
  return vpadalq_s32(a, b);
}

-// CHECK-LABEL: @test_vpadalq_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> %a, <16 x i8> %b)
-// CHECK: ret <8 x i16> [[VPADALQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VPADALQ_V1_I]]
+//
uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
  return vpadalq_u8(a, b);
}

-// CHECK-LABEL: @test_vpadalq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> %a, <8 x i16> %b)
-// CHECK: ret <4 x i32> [[VPADALQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VPADALQ_V2_I]]
+//
uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
  return vpadalq_u16(a, b);
}

-// CHECK-LABEL: @test_vpadalq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> %a, <4 x i32> %b)
-// CHECK: ret <2 x i64> [[VPADALQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VPADALQ_V2_I]]
+//
uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
  return vpadalq_u32(a, b);
}

-// CHECK-LABEL: @test_vpadd_s8(
-// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPADD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vpadd_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPADD_V_I]]
+//
int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
  return vpadd_s8(a, b);
}

-// CHECK-LABEL: @test_vpadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VPADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vpadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VPADD_V2_I]]
+//
int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
  return vpadd_s16(a, b);
}

-// CHECK-LABEL: @test_vpadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VPADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vpadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VPADD_V2_I]]
+//
int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
  return vpadd_s32(a, b);
}

-// CHECK-LABEL: @test_vpadd_u8(
-// CHECK: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VPADD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vpadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VPADD_V_I]]
+//
uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
  return vpadd_u8(a, b);
}

-// CHECK-LABEL: @test_vpadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VPADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vpadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VPADD_V2_I]]
+//
uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
  return vpadd_u16(a, b);
}

-// CHECK-LABEL: @test_vpadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VPADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vpadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VPADD_V2_I]]
+//
uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
  return vpadd_u32(a, b);
}

-// CHECK-LABEL: @test_vpadd_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x float> [[VPADD_V2_I]]
+// CHECK-LABEL: define <2 x float> @test_vpadd_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: ret <2 x float> [[VPADD_V2_I]]
+//
float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
  return vpadd_f32(a, b);
}

-// CHECK-LABEL: @test_vpaddl_s8(
-// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a)
-// CHECK: ret <4 x i16> [[VPADDL_I]]
+// CHECK-LABEL: define <4 x i16> @test_vpaddl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VPADDL_I]]
+//
int16x4_t test_vpaddl_s8(int8x8_t a) {
  return vpaddl_s8(a);
}

-// CHECK-LABEL: @test_vpaddl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %a)
-// CHECK: ret <2 x i32> [[VPADDL1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VPADDL1_I]]
+//
int32x2_t test_vpaddl_s16(int16x4_t a) {
  return vpaddl_s16(a);
}

-// CHECK-LABEL: @test_vpaddl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %a)
-// CHECK: ret <1 x i64> [[VPADDL1_I]]
+// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <1 x i64> [[VPADDL1_I]]
+//
int64x1_t test_vpaddl_s32(int32x2_t a) {
  return vpaddl_s32(a);
}

-// CHECK-LABEL: @test_vpaddl_u8(
-// CHECK: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a)
-// CHECK: ret <4 x i16> [[VPADDL_I]]
+// CHECK-LABEL: define <4 x i16> @test_vpaddl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VPADDL_I]]
+//
uint16x4_t test_vpaddl_u8(uint8x8_t a) {
  return vpaddl_u8(a);
}

-// CHECK-LABEL: @test_vpaddl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a)
-// CHECK: ret <2 x i32> [[VPADDL1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VPADDL1_I]]
+//
uint32x2_t test_vpaddl_u16(uint16x4_t a) {
  return vpaddl_u16(a);
}

-//
CHECK-LABEL: @test_vpaddl_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %a) -// CHECK: ret <1 x i64> [[VPADDL1_I]] +// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret <1 x i64> [[VPADDL1_I]] +// uint64x1_t test_vpaddl_u32(uint32x2_t a) { return vpaddl_u32(a); } -// CHECK-LABEL: @test_vpaddlq_s8( -// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a) -// CHECK: ret <8 x i16> [[VPADDL_I]] +// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VPADDL_I]] +// int16x8_t test_vpaddlq_s8(int8x16_t a) { return vpaddlq_s8(a); } -// CHECK-LABEL: @test_vpaddlq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %a) -// CHECK: ret <4 x i32> [[VPADDL1_I]] +// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VPADDL1_I]] +// int32x4_t test_vpaddlq_s16(int16x8_t a) { return vpaddlq_s16(a); } -// CHECK-LABEL: @test_vpaddlq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %a) -// CHECK: ret <2 x i64> [[VPADDL1_I]] +// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: ret <2 x i64> [[VPADDL1_I]] +// int64x2_t test_vpaddlq_s32(int32x4_t a) { return vpaddlq_s32(a); } -// CHECK-LABEL: @test_vpaddlq_u8( -// CHECK: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a) -// CHECK: ret <8 x i16> [[VPADDL_I]] +// CHECK-LABEL: define <8 x i16> @test_vpaddlq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VPADDL_I]] +// uint16x8_t test_vpaddlq_u8(uint8x16_t a) { return vpaddlq_u8(a); } -// CHECK-LABEL: @test_vpaddlq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a) -// CHECK: ret <4 x i32> [[VPADDL1_I]] +// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VPADDL1_I]] +// uint32x4_t test_vpaddlq_u16(uint16x8_t a) { return vpaddlq_u16(a); } -// CHECK-LABEL: @test_vpaddlq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VPADDL1_I:%.*]] = call <2 x i64> 
@llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a) -// CHECK: ret <2 x i64> [[VPADDL1_I]] +// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: ret <2 x i64> [[VPADDL1_I]] +// uint64x2_t test_vpaddlq_u32(uint32x4_t a) { return vpaddlq_u32(a); } -// CHECK-LABEL: @test_vpmax_s8( -// CHECK: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VPMAX_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vpmax_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VPMAX_V_I]] +// int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) { return vpmax_s8(a, b); } -// CHECK-LABEL: @test_vpmax_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VPMAX_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vpmax_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VPMAX_V2_I]] +// int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) { return vpmax_s16(a, b); } -// CHECK-LABEL: @test_vpmax_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VPMAX_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vpmax_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VPMAX_V2_I]] +// int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) { return vpmax_s32(a, b); } -// CHECK-LABEL: @test_vpmax_u8( -// CHECK: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VPMAX_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vpmax_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VPMAX_V_I]] +// uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) { return vpmax_u8(a, b); } -// CHECK-LABEL: @test_vpmax_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VPMAX_V2_I]] +// CHECK-LABEL: define <4 x i16> 
@test_vpmax_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VPMAX_V2_I]] +// uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) { return vpmax_u16(a, b); } -// CHECK-LABEL: @test_vpmax_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VPMAX_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vpmax_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VPMAX_V2_I]] +// uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) { return vpmax_u32(a, b); } -// CHECK-LABEL: @test_vpmax_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VPMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: [[VPMAX_V3_I:%.*]] = bitcast <2 x float> [[VPMAX_V2_I]] to <8 x i8> -// CHECK: ret <2 x float> [[VPMAX_V2_I]] +// CHECK-LABEL: define <2 x float> @test_vpmax_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VPMAX_V2_I]] +// float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) { return vpmax_f32(a, b); } -// CHECK-LABEL: @test_vpmin_s8( -// CHECK: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VPMIN_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vpmin_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VPMIN_V_I]] +// int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) { return vpmin_s8(a, b); } -// CHECK-LABEL: @test_vpmin_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VPMIN_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vpmin_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VPMIN_V2_I]] +// int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) { return vpmin_s16(a, b); } -// CHECK-LABEL: @test_vpmin_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 
x i32> %a, <2 x i32> %b) -// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VPMIN_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vpmin_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VPMIN_V2_I]] +// int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) { return vpmin_s32(a, b); } -// CHECK-LABEL: @test_vpmin_u8( -// CHECK: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VPMIN_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vpmin_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VPMIN_V_I]] +// uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) { return vpmin_u8(a, b); } -// CHECK-LABEL: @test_vpmin_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VPMIN_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vpmin_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VPMIN_V2_I]] +// uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) { return vpmin_u16(a, b); } -// CHECK-LABEL: @test_vpmin_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VPMIN_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vpmin_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VPMIN_V2_I]] +// uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) { return vpmin_u32(a, b); } -// CHECK-LABEL: @test_vpmin_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VPMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: [[VPMIN_V3_I:%.*]] = bitcast <2 x float> [[VPMIN_V2_I]] to <8 x i8> -// CHECK: ret <2 x float> [[VPMIN_V2_I]] +// CHECK-LABEL: define <2 x float> @test_vpmin_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VPMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VPMIN_V2_I]] +// float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) { return vpmin_f32(a, b); } -// CHECK-LABEL: @test_vqabs_s8( -// CHECK: 
[[VQABS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a) -// CHECK: ret <8 x i8> [[VQABS_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vqabs_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> [[A]]) +// CHECK-NEXT: ret <8 x i8> [[VQABS_V_I]] +// int8x8_t test_vqabs_s8(int8x8_t a) { return vqabs_s8(a); } -// CHECK-LABEL: @test_vqabs_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %a) -// CHECK: [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQABS_V1_I]] +// CHECK-LABEL: define <4 x i16> @test_vqabs_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> [[A]]) +// CHECK-NEXT: ret <4 x i16> [[VQABS_V1_I]] +// int16x4_t test_vqabs_s16(int16x4_t a) { return vqabs_s16(a); } -// CHECK-LABEL: @test_vqabs_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %a) -// CHECK: [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQABS_V1_I]] +// CHECK-LABEL: define <2 x i32> @test_vqabs_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VQABS_V1_I]] +// int32x2_t test_vqabs_s32(int32x2_t a) { return vqabs_s32(a); } -// CHECK-LABEL: @test_vqabsq_s8( -// CHECK: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a) -// CHECK: ret <16 x i8> [[VQABSQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vqabsq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> [[A]]) +// CHECK-NEXT: ret <16 x i8> [[VQABSQ_V_I]] +// int8x16_t test_vqabsq_s8(int8x16_t a) { return vqabsq_s8(a); } -// CHECK-LABEL: @test_vqabsq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %a) -// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQABSQ_V1_I]] +// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> [[A]]) +// CHECK-NEXT: ret <8 x i16> [[VQABSQ_V1_I]] +// int16x8_t test_vqabsq_s16(int16x8_t a) { return vqabsq_s16(a); } -// CHECK-LABEL: @test_vqabsq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %a) -// CHECK: [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQABSQ_V1_I]] +// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VQABSQ_V1_I]] +// int32x4_t test_vqabsq_s32(int32x4_t a) { return 
vqabsq_s32(a); } -// CHECK-LABEL: @test_vqadd_s8( -// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQADD_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vqadd_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQADD_V_I]] +// int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) { return vqadd_s8(a, b); } -// CHECK-LABEL: @test_vqadd_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQADD_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vqadd_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VQADD_V2_I]] +// int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) { return vqadd_s16(a, b); } -// CHECK-LABEL: @test_vqadd_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQADD_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vqadd_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VQADD_V2_I]] +// int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) { return vqadd_s32(a, b); } -// CHECK-LABEL: @test_vqadd_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.sadd.sat.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQADD_V2_I]] +// CHECK-LABEL: define <1 x i64> @test_vqadd_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.sadd.sat.v1i64(<1 x i64> [[A]], <1 x i64> [[B]]) +// CHECK-NEXT: ret <1 x i64> [[VQADD_V2_I]] +// int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) { return vqadd_s64(a, b); } -// CHECK-LABEL: @test_vqadd_u8( -// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQADD_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vqadd_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQADD_V_I]] +// uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) { return vqadd_u8(a, b); } -// CHECK-LABEL: @test_vqadd_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x 
i16> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQADD_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vqadd_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VQADD_V2_I]] +// uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) { return vqadd_u16(a, b); } -// CHECK-LABEL: @test_vqadd_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQADD_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vqadd_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VQADD_V2_I]] +// uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) { return vqadd_u32(a, b); } -// CHECK-LABEL: @test_vqadd_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.uadd.sat.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQADD_V2_I]] +// CHECK-LABEL: define <1 x i64> @test_vqadd_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.uadd.sat.v1i64(<1 x i64> [[A]], <1 x i64> [[B]]) +// CHECK-NEXT: ret <1 x i64> [[VQADD_V2_I]] +// uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) { return vqadd_u64(a, b); } -// CHECK-LABEL: @test_vqaddq_s8( -// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQADDQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vqaddq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQADDQ_V_I]] +// int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) { return vqaddq_s8(a, b); } -// CHECK-LABEL: @test_vqaddq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQADDQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VQADDQ_V2_I]] +// int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) { return vqaddq_s16(a, b); } -// CHECK-LABEL: 
@test_vqaddq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQADDQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VQADDQ_V2_I]] +// int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) { return vqaddq_s32(a, b); } -// CHECK-LABEL: @test_vqaddq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQADDQ_V2_I]] +// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i64> [[VQADDQ_V2_I]] +// int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) { return vqaddq_s64(a, b); } -// CHECK-LABEL: @test_vqaddq_u8( -// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQADDQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vqaddq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQADDQ_V_I]] +// uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) { return vqaddq_u8(a, b); } -// CHECK-LABEL: @test_vqaddq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQADDQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VQADDQ_V2_I]] +// uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) { return vqaddq_u16(a, b); } -// CHECK-LABEL: @test_vqaddq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQADDQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) 
+// CHECK-NEXT: ret <4 x i32> [[VQADDQ_V2_I]]
+//
uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
  return vqaddq_u32(a, b);
}
-// CHECK-LABEL: @test_vqaddq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQADDQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQADDQ_V2_I]]
+//
uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
  return vqaddq_u64(a, b);
}
-// CHECK-LABEL: @test_vqdmlal_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
-// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
+//
int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlal_s16(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlal_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
-// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
+//
int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlal_s32(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlal_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
-// CHECK: ret <4 x i32> [[VQDMLAL_V3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
+//
int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlal_lane_s16(a, b, c, 3);
}
-// CHECK-LABEL: @test_vqdmlal_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
-// CHECK: ret <2 x i64> [[VQDMLAL_V3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
+//
int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlal_lane_s32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vqdmlal_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
-// CHECK: ret <4 x i32> [[VQDMLAL_V6_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]]
+//
int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vqdmlal_n_s16(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlal_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
-// CHECK: ret <2 x i64> [[VQDMLAL_V4_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]]
+//
int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vqdmlal_n_s32(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlsl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]])
-// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[C]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
+//
int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlsl_s16(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlsl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]])
-// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[C]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
+//
int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlsl_s32(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlsl_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
-// CHECK: ret <4 x i32> [[VQDMLSL_V3_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], <4 x i16> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[C]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
+//
int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
  return vqdmlsl_lane_s16(a, b, c, 3);
}
-// CHECK-LABEL: @test_vqdmlsl_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
-// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
-// CHECK: ret <2 x i64> [[VQDMLSL_V3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], <2 x i32> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[C]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
+//
int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
  return vqdmlsl_lane_s32(a, b, c, 1);
}
-// CHECK-LABEL: @test_vqdmlsl_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]])
-// CHECK: ret <4 x i32> [[VQDMLSL_V6_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]], i16 noundef signext [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]]
+//
int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
  return vqdmlsl_n_s16(a, b, c);
}
-// CHECK-LABEL: @test_vqdmlsl_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %c, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]])
-// CHECK: ret <2 x i64> [[VQDMLSL_V4_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]], i32 noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[C]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]]
+//
int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
  return vqdmlsl_n_s32(a, b, c);
}
-// CHECK-LABEL: @test_vqdmulh_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+//
int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
  return vqdmulh_s16(a, b);
}
-// CHECK-LABEL: @test_vqdmulh_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+//
int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
  return vqdmulh_s32(a, b);
}
-// CHECK-LABEL: @test_vqdmulhq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+//
int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
  return vqdmulhq_s16(a, b);
}
-// CHECK-LABEL: @test_vqdmulhq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+//
int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
  return vqdmulhq_s32(a, b);
}
-// CHECK-LABEL: @test_vqdmulh_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQDMULH_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
+//
int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
  return vqdmulh_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vqdmulh_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQDMULH_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
+//
int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
  return vqdmulh_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vqdmulhq_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]])
-// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]])
+// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
+//
int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
  return vqdmulhq_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vqdmulhq_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]])
-// CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
+//
int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) { return vqdmulhq_lane_s32(a, b, 1); }
-// CHECK-LABEL: @test_vqdmulh_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQDMULH_V5_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I_I]]
+//
int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) { return vqdmulh_n_s16(a, b); }
-// CHECK-LABEL: @test_vqdmulh_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQDMULH_V3_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I_I]]
+//
int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) { return vqdmulh_n_s32(a, b); }
-// CHECK-LABEL: @test_vqdmulhq_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
-// CHECK: [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQDMULHQ_V9_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
+// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I_I]]
+//
int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) { return vqdmulhq_n_s16(a, b); }
-// CHECK-LABEL: @test_vqdmulhq_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
-// CHECK: [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
-// CHECK: [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULHQ_V5_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I_I]]
+//
int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) { return vqdmulhq_n_s32(a, b); }
-// CHECK-LABEL: @test_vqdmull_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
+//
int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) { return vqdmull_s16(a, b); }
-// CHECK-LABEL: @test_vqdmull_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
+//
int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) { return vqdmull_s32(a, b); }
-// CHECK-LABEL: @test_vqdmull_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULL_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
+//
int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) { return vqdmull_lane_s16(a, b, 3); }
-// CHECK-LABEL: @test_vqdmull_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQDMULL_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
+//
int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) { return vqdmull_lane_s32(a, b, 1); }
-// CHECK-LABEL: @test_vqdmull_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQDMULL_V5_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]]
+//
int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) { return vqdmull_n_s16(a, b); }
-// CHECK-LABEL: @test_vqdmull_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQDMULL_V3_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]]
+//
int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) { return vqdmull_n_s32(a, b); }
-// CHECK-LABEL: @test_vqmovn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %a)
-// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQMOVN_V1_I]]
+//
int8x8_t test_vqmovn_s16(int16x8_t a) { return vqmovn_s16(a); }
-// CHECK-LABEL: @test_vqmovn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VQMOVN_V1_I]]
+//
int16x4_t test_vqmovn_s32(int32x4_t a) { return vqmovn_s32(a); }
-// CHECK-LABEL: @test_vqmovn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VQMOVN_V1_I]]
+//
int32x2_t test_vqmovn_s64(int64x2_t a) { return vqmovn_s64(a); }
-// CHECK-LABEL: @test_vqmovn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %a)
-// CHECK: ret <8 x i8> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQMOVN_V1_I]]
+//
uint8x8_t test_vqmovn_u16(uint16x8_t a) { return vqmovn_u16(a); }
-// CHECK-LABEL: @test_vqmovn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VQMOVN_V1_I]]
+//
uint16x4_t test_vqmovn_u32(uint32x4_t a) { return vqmovn_u32(a); }
-// CHECK-LABEL: @test_vqmovn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %a)
-// CHECK: [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQMOVN_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VQMOVN_V1_I]]
+//
uint32x2_t test_vqmovn_u64(uint64x2_t a) { return vqmovn_u64(a); }
-// CHECK-LABEL: @test_vqmovun_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %a)
-// CHECK: ret <8 x i8> [[VQMOVUN_V1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQMOVUN_V1_I]]
+//
uint8x8_t test_vqmovun_s16(int16x8_t a) { return vqmovun_s16(a); }
-// CHECK-LABEL: @test_vqmovun_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %a)
-// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQMOVUN_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VQMOVUN_V1_I]]
+//
uint16x4_t test_vqmovun_s32(int32x4_t a) { return vqmovun_s32(a); }
-// CHECK-LABEL: @test_vqmovun_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %a)
-// CHECK: [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQMOVUN_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VQMOVUN_V1_I]]
+//
uint32x2_t test_vqmovun_s64(int64x2_t a) { return vqmovun_s64(a); }
-// CHECK-LABEL: @test_vqneg_s8(
-// CHECK: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a)
-// CHECK: ret <8 x i8> [[VQNEG_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqneg_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> [[A]])
+// CHECK-NEXT: ret <8 x i8> [[VQNEG_V_I]]
+//
int8x8_t test_vqneg_s8(int8x8_t a) { return vqneg_s8(a); }
-// CHECK-LABEL: @test_vqneg_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %a)
-// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQNEG_V1_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqneg_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> [[A]])
+// CHECK-NEXT: ret <4 x i16> [[VQNEG_V1_I]]
+//
int16x4_t test_vqneg_s16(int16x4_t a) { return vqneg_s16(a); }
-// CHECK-LABEL: @test_vqneg_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %a)
-// CHECK: [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQNEG_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqneg_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VQNEG_V1_I]]
+//
int32x2_t test_vqneg_s32(int32x2_t a) { return vqneg_s32(a); }
-// CHECK-LABEL: @test_vqnegq_s8(
-// CHECK: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a)
-// CHECK: ret <16 x i8> [[VQNEGQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqnegq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> [[A]])
+// CHECK-NEXT: ret <16 x i8> [[VQNEGQ_V_I]]
+//
int8x16_t test_vqnegq_s8(int8x16_t a) { return vqnegq_s8(a); }
-// CHECK-LABEL: @test_vqnegq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %a)
-// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQNEGQ_V1_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> [[A]])
+// CHECK-NEXT: ret <8 x i16> [[VQNEGQ_V1_I]]
+//
int16x8_t test_vqnegq_s16(int16x8_t a) { return vqnegq_s16(a); }
-// CHECK-LABEL: @test_vqnegq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %a)
-// CHECK: [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQNEGQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VQNEGQ_V1_I]]
+//
int32x4_t test_vqnegq_s32(int32x4_t a) { return vqnegq_s32(a); }
-// CHECK-LABEL: @test_vqrdmulh_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+//
int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) { return vqrdmulh_s16(a, b); }
-// CHECK-LABEL: @test_vqrdmulh_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+//
int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) { return vqrdmulh_s32(a, b); }
-// CHECK-LABEL: @test_vqrdmulhq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+//
int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) { return vqrdmulhq_s16(a, b); }
-// CHECK-LABEL: @test_vqrdmulhq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+//
int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) { return vqrdmulhq_s32(a, b); }
-// CHECK-LABEL: @test_vqrdmulh_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]])
-// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRDMULH_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]])
+// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
+//
int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) { return vqrdmulh_lane_s16(a, b, 3); }
-// CHECK-LABEL: @test_vqrdmulh_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]])
-// CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRDMULH_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]])
+// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
+//
int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) { return vqrdmulh_lane_s32(a, b, 1); }
-// CHECK-LABEL: @test_vqrdmulhq_lane_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]])
-// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[B]], <4 x i16> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]])
+// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
+//
int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) { return vqrdmulhq_lane_s16(a, b, 3); }
-// CHECK-LABEL: @test_vqrdmulhq_lane_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]])
-// CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[B]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]])
+// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
+//
int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) { return vqrdmulhq_lane_s32(a, b, 1); }
-// CHECK-LABEL: @test_vqrdmulh_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK: [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[VECINIT3_I]])
-// CHECK: [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRDMULH_V5_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i16> [[VECINIT_I]], <4 x i16> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I_I]]
+//
int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) { return vqrdmulh_n_s16(a, b); }
-// CHECK-LABEL: @test_vqrdmulh_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK: [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[VECINIT1_I]])
-// CHECK: [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRDMULH_V3_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <2 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT1_I:%.*]] = shufflevector <2 x i32> [[VECINIT_I]], <2 x i32> poison, <2 x i32> zeroinitializer
+// CHECK-NEXT: [[VQRDMULH_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I_I]]
+//
int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) { return vqrdmulh_n_s32(a, b); }
-// CHECK-LABEL: @test_vqrdmulhq_n_s16(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
-// CHECK: [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
-// CHECK: [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
-// CHECK: [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
-// CHECK: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[VECINIT7_I]])
-// CHECK: [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRDMULHQ_V9_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], i16 noundef signext [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT7_I:%.*]] = shufflevector <8 x i16> [[VECINIT_I]], <8 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
+// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I_I]]
+//
int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) { return vqrdmulhq_n_s16(a, b); }
-// CHECK-LABEL: @test_vqrdmulhq_n_s32(
-// CHECK: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 %b, i32 0
-// CHECK: [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
-// CHECK: [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
-// CHECK: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
-// CHECK: [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[VECINIT3_I]])
-// CHECK: [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRDMULHQ_V5_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i64 0
+// CHECK-NEXT: [[VECINIT3_I:%.*]] = shufflevector <4 x i32> [[VECINIT_I]], <4 x i32> poison, <4 x i32> zeroinitializer
+// CHECK-NEXT: [[VQRDMULHQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
+// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I_I]]
+//
int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) { return vqrdmulhq_n_s32(a, b); }
-// CHECK-LABEL: @test_vqrshl_s8(
-// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQRSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqrshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQRSHL_V_I]]
+//
int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) { return vqrshl_s8(a, b); }
-// CHECK-LABEL: @test_vqrshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqrshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQRSHL_V2_I]]
+//
int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) { return vqrshl_s16(a, b); }
-// CHECK-LABEL: @test_vqrshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqrshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQRSHL_V2_I]]
+//
int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) { return vqrshl_s32(a, b); }
-// CHECK-LABEL: @test_vqrshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vqrshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQRSHL_V2_I]]
+//
int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) { return vqrshl_s64(a, b); }
-// CHECK-LABEL: @test_vqrshl_u8(
-// CHECK: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQRSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqrshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQRSHL_V_I]]
+//
uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) { return vqrshl_u8(a, b); }
-// CHECK-LABEL: @test_vqrshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqrshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQRSHL_V2_I]]
+//
uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) { return vqrshl_u16(a, b); }
-// CHECK-LABEL: @test_vqrshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqrshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQRSHL_V2_I]]
+//
uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) { return vqrshl_u32(a, b); }
-// CHECK-LABEL: @test_vqrshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQRSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vqrshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQRSHL_V2_I]]
+//
uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) { return vqrshl_u64(a, b); }
-// CHECK-LABEL: @test_vqrshlq_s8(
-// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqrshlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQRSHLQ_V_I]]
+//
int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) { return vqrshlq_s8(a, b); }
-// CHECK-LABEL: @test_vqrshlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQRSHLQ_V2_I]]
+//
int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) { return vqrshlq_s16(a, b); }
-// CHECK-LABEL: @test_vqrshlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQRSHLQ_V2_I]]
+//
int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) { return vqrshlq_s32(a, b); }
-// CHECK-LABEL: @test_vqrshlq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQRSHLQ_V2_I]]
+//
int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) { return vqrshlq_s64(a, b); }
-// CHECK-LABEL: @test_vqrshlq_u8(
-// CHECK: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQRSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqrshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VQRSHLQ_V_I]]
+//
uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) { return vqrshlq_u8(a, b); }
-// CHECK-LABEL: @test_vqrshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VQRSHLQ_V2_I]]
+//
uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) { return vqrshlq_u16(a, b); }
-// CHECK-LABEL: @test_vqrshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VQRSHLQ_V2_I]]
+//
uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) { return vqrshlq_u32(a, b); }
-// CHECK-LABEL: @test_vqrshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VQRSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VQRSHLQ_V2_I]]
+//
uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) { return vqrshlq_u64(a, b); }
-// CHECK-LABEL: @test_vqrshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VQRSHRN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VQRSHRN_N1]]
+//
int8x8_t test_vqrshrn_n_s16(int16x8_t a) { return vqrshrn_n_s16(a, 1); }
-// CHECK-LABEL: @test_vqrshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VQRSHRN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VQRSHRN_N1]]
+//
int16x4_t test_vqrshrn_n_s32(int32x4_t a) { return vqrshrn_n_s32(a, 1); }
-// CHECK-LABEL: @test_vqrshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VQRSHRN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VQRSHRN_N1]]
+//
int32x2_t test_vqrshrn_n_s64(int64x2_t a) { return vqrshrn_n_s64(a, 1); }
-// CHECK-LABEL: @test_vqrshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VQRSHRN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VQRSHRN_N1]]
+//
uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) { return vqrshrn_n_u16(a, 1); }
-// CHECK-LABEL: @test_vqrshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VQRSHRN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VQRSHRN_N1]]
+//
uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) { return vqrshrn_n_u32(a, 1); }
-// CHECK-LABEL: @test_vqrshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VQRSHRN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VQRSHRN_N1]]
+//
uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) { return vqrshrn_n_u64(a, 1); }
-// CHECK-LABEL: @test_vqrshrun_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> [[VQRSHRUN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VQRSHRUN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VQRSHRUN_N1]]
+//
uint8x8_t test_vqrshrun_n_s16(int16x8_t a) { return vqrshrun_n_s16(a, 1); }
-// CHECK-LABEL: @test_vqrshrun_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> [[VQRSHRUN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VQRSHRUN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VQRSHRUN_N1]]
+//
uint16x4_t test_vqrshrun_n_s32(int32x4_t a) { return vqrshrun_n_s32(a, 1); }
-// CHECK-LABEL: @test_vqrshrun_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> [[VQRSHRUN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VQRSHRUN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VQRSHRUN_N1]]
+//
uint32x2_t test_vqrshrun_n_s64(int64x2_t a) { return vqrshrun_n_s64(a, 1); }
-// CHECK-LABEL: @test_vqshl_s8(
-// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQSHL_V_I]]
+//
int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) { return vqshl_s8(a, b); }
-// CHECK-LABEL: @test_vqshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQSHL_V2_I]]
+//
int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) { return vqshl_s16(a, b); }
-// CHECK-LABEL: @test_vqshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQSHL_V2_I]]
+//
int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) { return vqshl_s32(a, b); }
-// CHECK-LABEL: @test_vqshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vqshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_V2_I]]
+//
int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) { return vqshl_s64(a, b); }
-// CHECK-LABEL: @test_vqshl_u8(
-// CHECK: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VQSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vqshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VQSHL_V_I]]
+//
uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) { return vqshl_u8(a, b); }
-// CHECK-LABEL: @test_vqshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VQSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vqshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VQSHL_V2_I]]
+//
uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) { return vqshl_u16(a, b); }
-// CHECK-LABEL: @test_vqshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VQSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vqshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VQSHL_V2_I]]
+//
uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) { return vqshl_u32(a, b); }
-// CHECK-LABEL: @test_vqshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VQSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vqshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VQSHL_V2_I]]
+//
uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) { return vqshl_u64(a, b); }
-// CHECK-LABEL: @test_vqshlq_s8(
-// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VQSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
= call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQSHLQ_V_I]] +// int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) { return vqshlq_s8(a, b); } -// CHECK-LABEL: @test_vqshlq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vqshlq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VQSHLQ_V2_I]] +// int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) { return vqshlq_s16(a, b); } -// CHECK-LABEL: @test_vqshlq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vqshlq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VQSHLQ_V2_I]] +// int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) { return vqshlq_s32(a, b); } -// CHECK-LABEL: @test_vqshlq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define <2 x i64> @test_vqshlq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i64> [[VQSHLQ_V2_I]] +// int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) { return vqshlq_s64(a, b); } -// CHECK-LABEL: @test_vqshlq_u8( -// CHECK: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQSHLQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vqshlq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQSHLQ_V_I]] +// uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) { return vqshlq_u8(a, b); } -// CHECK-LABEL: @test_vqshlq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret 
<8 x i16> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vqshlq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VQSHLQ_V2_I]] +// uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) { return vqshlq_u16(a, b); } -// CHECK-LABEL: @test_vqshlq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vqshlq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VQSHLQ_V2_I]] +// uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) { return vqshlq_u32(a, b); } -// CHECK-LABEL: @test_vqshlq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQSHLQ_V2_I]] +// CHECK-LABEL: define <2 x i64> @test_vqshlq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i64> [[VQSHLQ_V2_I]] +// uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) { return vqshlq_u64(a, b); } -// CHECK-LABEL: @test_vqshlu_n_s8( -// CHECK: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %a, <8 x i8> splat (i8 1) -// CHECK: ret <8 x i8> [[VQSHLU_N]] +// CHECK-LABEL: define <8 x i8> @test_vqshlu_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 1)) +// CHECK-NEXT: ret <8 x i8> [[VQSHLU_N]] +// uint8x8_t test_vqshlu_n_s8(int8x8_t a) { return vqshlu_n_s8(a, 1); } -// CHECK-LABEL: @test_vqshlu_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> splat (i16 1) -// CHECK: ret <4 x i16> [[VQSHLU_N1]] +// CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> [[A]], <4 x i16> splat (i16 1)) +// CHECK-NEXT: ret <4 x i16> [[VQSHLU_N1]] +// uint16x4_t test_vqshlu_n_s16(int16x4_t a) { return vqshlu_n_s16(a, 1); } -// CHECK-LABEL: @test_vqshlu_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i32> 
@llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> splat (i32 1)) -// CHECK: ret <2 x i32> [[VQSHLU_N1]] +// CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> [[A]], <2 x i32> splat (i32 1)) +// CHECK-NEXT: ret <2 x i32> [[VQSHLU_N1]] +// uint32x2_t test_vqshlu_n_s32(int32x2_t a) { return vqshlu_n_s32(a, 1); } -// CHECK-LABEL: @test_vqshlu_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> splat (i64 1)) -// CHECK: ret <1 x i64> [[VQSHLU_N1]] +// CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 1)) +// CHECK-NEXT: ret <1 x i64> [[VQSHLU_N1]] +// uint64x1_t test_vqshlu_n_s64(int64x1_t a) { return vqshlu_n_s64(a, 1); } -// CHECK-LABEL: @test_vqshluq_n_s8( -// CHECK: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %a, <16 x i8> splat (i8 1)) -// CHECK: ret <16 x i8> [[VQSHLU_N]] +// CHECK-LABEL: define <16 x i8> @test_vqshluq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 1)) +// CHECK-NEXT: ret <16 x i8> [[VQSHLU_N]] +// uint8x16_t test_vqshluq_n_s8(int8x16_t a) { return vqshluq_n_s8(a, 1); } -// CHECK-LABEL: @test_vqshluq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> splat (i16 1)) -// CHECK: ret <8 x i16> [[VQSHLU_N1]] +// CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> [[A]], <8 x i16> splat (i16 1)) +// CHECK-NEXT: ret <8 x i16> [[VQSHLU_N1]] +// uint16x8_t test_vqshluq_n_s16(int16x8_t a) { return vqshluq_n_s16(a, 1); } -// CHECK-LABEL: @test_vqshluq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> splat (i32 1)) -// CHECK: ret <4 x i32> [[VQSHLU_N1]] +// CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> [[A]], <4 x i32> splat (i32 1)) +// CHECK-NEXT: ret <4 x i32> [[VQSHLU_N1]] +// uint32x4_t test_vqshluq_n_s32(int32x4_t a) { return vqshluq_n_s32(a, 1); } -// CHECK-LABEL: @test_vqshluq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> splat (i64 1)) -// CHECK: ret <2 
x i64> [[VQSHLU_N1]] +// CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> [[A]], <2 x i64> splat (i64 1)) +// CHECK-NEXT: ret <2 x i64> [[VQSHLU_N1]] +// uint64x2_t test_vqshluq_n_s64(int64x2_t a) { return vqshluq_n_s64(a, 1); } -// CHECK-LABEL: @test_vqshl_n_s8( -// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> splat (i8 1)) -// CHECK: ret <8 x i8> [[VQSHL_N]] +// CHECK-LABEL: define <8 x i8> @test_vqshl_n_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 1)) +// CHECK-NEXT: ret <8 x i8> [[VQSHL_N]] +// int8x8_t test_vqshl_n_s8(int8x8_t a) { return vqshl_n_s8(a, 1); } -// CHECK-LABEL: @test_vqshl_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> splat (i16 1)) -// CHECK: ret <4 x i16> [[VQSHL_N1]] +// CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[A]], <4 x i16> splat (i16 1)) +// CHECK-NEXT: ret <4 x i16> [[VQSHL_N1]] +// int16x4_t test_vqshl_n_s16(int16x4_t a) { return vqshl_n_s16(a, 1); } -// CHECK-LABEL: @test_vqshl_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> splat (i32 1)) -// CHECK: ret <2 x i32> [[VQSHL_N1]] +// CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[A]], <2 x i32> splat (i32 1)) +// CHECK-NEXT: ret <2 x i32> [[VQSHL_N1]] +// int32x2_t test_vqshl_n_s32(int32x2_t a) { return vqshl_n_s32(a, 1); } -// CHECK-LABEL: @test_vqshl_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1)) -// CHECK: ret <1 x i64> [[VQSHL_N1]] +// CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 1)) +// CHECK-NEXT: ret <1 x i64> [[VQSHL_N1]] +// int64x1_t test_vqshl_n_s64(int64x1_t a) { return vqshl_n_s64(a, 1); } -// CHECK-LABEL: @test_vqshl_n_u8( -// CHECK: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> splat (i8 1)) -// CHECK: ret <8 x i8> [[VQSHL_N]] +// CHECK-LABEL: define <8 x i8> @test_vqshl_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 1)) +// CHECK-NEXT: ret <8 x i8> [[VQSHL_N]] +// uint8x8_t 
test_vqshl_n_u8(uint8x8_t a) { return vqshl_n_u8(a, 1); } -// CHECK-LABEL: @test_vqshl_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> splat (i16 1)) -// CHECK: ret <4 x i16> [[VQSHL_N1]] +// CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[A]], <4 x i16> splat (i16 1)) +// CHECK-NEXT: ret <4 x i16> [[VQSHL_N1]] +// uint16x4_t test_vqshl_n_u16(uint16x4_t a) { return vqshl_n_u16(a, 1); } -// CHECK-LABEL: @test_vqshl_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> splat (i32 1)) -// CHECK: ret <2 x i32> [[VQSHL_N1]] +// CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[A]], <2 x i32> splat (i32 1)) +// CHECK-NEXT: ret <2 x i32> [[VQSHL_N1]] +// uint32x2_t test_vqshl_n_u32(uint32x2_t a) { return vqshl_n_u32(a, 1); } -// CHECK-LABEL: @test_vqshl_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> splat (i64 1)) -// CHECK: ret <1 x i64> [[VQSHL_N1]] +// CHECK-LABEL: define <1 x i64> @test_vqshl_n_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 1)) +// CHECK-NEXT: ret <1 x i64> [[VQSHL_N1]] +// uint64x1_t test_vqshl_n_u64(uint64x1_t a) { return vqshl_n_u64(a, 1); } -// CHECK-LABEL: @test_vqshlq_n_s8( -// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> splat (i8 1)) -// CHECK: ret <16 x i8> [[VQSHL_N]] +// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 1)) +// CHECK-NEXT: ret <16 x i8> [[VQSHL_N]] +// int8x16_t test_vqshlq_n_s8(int8x16_t a) { return vqshlq_n_s8(a, 1); } -// CHECK-LABEL: @test_vqshlq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> splat (i16 1)) -// CHECK: ret <8 x i16> [[VQSHL_N1]] +// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[A]], <8 x i16> splat (i16 1)) +// CHECK-NEXT: ret <8 x i16> [[VQSHL_N1]] +// int16x8_t test_vqshlq_n_s16(int16x8_t a) { return vqshlq_n_s16(a, 1); } -// CHECK-LABEL: @test_vqshlq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// 
CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> splat (i32 1)) -// CHECK: ret <4 x i32> [[VQSHL_N1]] +// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[A]], <4 x i32> splat (i32 1)) +// CHECK-NEXT: ret <4 x i32> [[VQSHL_N1]] +// int32x4_t test_vqshlq_n_s32(int32x4_t a) { return vqshlq_n_s32(a, 1); } -// CHECK-LABEL: @test_vqshlq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> splat (i64 1)) -// CHECK: ret <2 x i64> [[VQSHL_N1]] +// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[A]], <2 x i64> splat (i64 1)) +// CHECK-NEXT: ret <2 x i64> [[VQSHL_N1]] +// int64x2_t test_vqshlq_n_s64(int64x2_t a) { return vqshlq_n_s64(a, 1); } -// CHECK-LABEL: @test_vqshlq_n_u8( -// CHECK: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> splat (i8 1)) -// CHECK: ret <16 x i8> [[VQSHL_N]] +// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 1)) +// CHECK-NEXT: ret <16 x i8> [[VQSHL_N]] +// uint8x16_t test_vqshlq_n_u8(uint8x16_t a) { return vqshlq_n_u8(a, 1); } -// CHECK-LABEL: @test_vqshlq_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> splat (i16 1)) -// CHECK: ret <8 x i16> [[VQSHL_N1]] +// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[A]], <8 x i16> splat (i16 1)) +// CHECK-NEXT: ret <8 x i16> [[VQSHL_N1]] +// uint16x8_t test_vqshlq_n_u16(uint16x8_t a) { return vqshlq_n_u16(a, 1); } -// CHECK-LABEL: @test_vqshlq_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> splat (i32 1)) -// CHECK: ret <4 x i32> [[VQSHL_N1]] +// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[A]], <4 x i32> splat (i32 1)) +// CHECK-NEXT: ret <4 x i32> [[VQSHL_N1]] +// uint32x4_t test_vqshlq_n_u32(uint32x4_t a) { return vqshlq_n_u32(a, 1); } -// CHECK-LABEL: @test_vqshlq_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x 
i64> [[VQSHL_N]], <2 x i64> splat (i64 1)) -// CHECK: ret <2 x i64> [[VQSHL_N1]] +// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[A]], <2 x i64> splat (i64 1)) +// CHECK-NEXT: ret <2 x i64> [[VQSHL_N1]] +// uint64x2_t test_vqshlq_n_u64(uint64x2_t a) { return vqshlq_n_u64(a, 1); } -// CHECK-LABEL: @test_vqshrn_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> splat (i16 -1)) -// CHECK: ret <8 x i8> [[VQSHRN_N1]] +// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> [[A]], <8 x i16> splat (i16 -1)) +// CHECK-NEXT: ret <8 x i8> [[VQSHRN_N1]] +// int8x8_t test_vqshrn_n_s16(int16x8_t a) { return vqshrn_n_s16(a, 1); } -// CHECK-LABEL: @test_vqshrn_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> splat (i32 -1)) -// CHECK: ret <4 x i16> [[VQSHRN_N1]] +// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> [[A]], <4 x i32> splat (i32 -1)) +// CHECK-NEXT: ret <4 x i16> [[VQSHRN_N1]] +// int16x4_t test_vqshrn_n_s32(int32x4_t a) { return vqshrn_n_s32(a, 1); } -// CHECK-LABEL: @test_vqshrn_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> splat (i64 -1)) -// CHECK: ret <2 x i32> [[VQSHRN_N1]] +// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> [[A]], <2 x i64> splat (i64 -1)) +// CHECK-NEXT: ret <2 x i32> [[VQSHRN_N1]] +// int32x2_t test_vqshrn_n_s64(int64x2_t a) { return vqshrn_n_s64(a, 1); } -// CHECK-LABEL: @test_vqshrn_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> splat (i16 -1)) -// CHECK: ret <8 x i8> [[VQSHRN_N1]] +// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> [[A]], <8 x i16> splat (i16 -1)) +// CHECK-NEXT: ret <8 x i8> [[VQSHRN_N1]] +// uint8x8_t test_vqshrn_n_u16(uint16x8_t a) { return vqshrn_n_u16(a, 1); } -// CHECK-LABEL: @test_vqshrn_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQSHRN_N1:%.*]] = call <4 x i16> 
@llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> splat (i32 -1)) -// CHECK: ret <4 x i16> [[VQSHRN_N1]] +// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> [[A]], <4 x i32> splat (i32 -1)) +// CHECK-NEXT: ret <4 x i16> [[VQSHRN_N1]] +// uint16x4_t test_vqshrn_n_u32(uint32x4_t a) { return vqshrn_n_u32(a, 1); } -// CHECK-LABEL: @test_vqshrn_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> splat (i64 -1)) -// CHECK: ret <2 x i32> [[VQSHRN_N1]] +// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> [[A]], <2 x i64> splat (i64 -1)) +// CHECK-NEXT: ret <2 x i32> [[VQSHRN_N1]] +// uint32x2_t test_vqshrn_n_u64(uint64x2_t a) { return vqshrn_n_u64(a, 1); } -// CHECK-LABEL: @test_vqshrun_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> [[VQSHRUN_N]], <8 x i16> splat (i16 -1)) -// CHECK: ret <8 x i8> [[VQSHRUN_N1]] +// CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> [[A]], <8 x i16> splat (i16 -1)) +// CHECK-NEXT: ret <8 x i8> [[VQSHRUN_N1]] +// uint8x8_t test_vqshrun_n_s16(int16x8_t a) { return vqshrun_n_s16(a, 1); } -// CHECK-LABEL: @test_vqshrun_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> [[VQSHRUN_N]], <4 x i32> splat (i32 -1)) -// CHECK: ret <4 x i16> [[VQSHRUN_N1]] +// CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> [[A]], <4 x i32> splat (i32 -1)) +// CHECK-NEXT: ret <4 x i16> [[VQSHRUN_N1]] +// uint16x4_t test_vqshrun_n_s32(int32x4_t a) { return vqshrun_n_s32(a, 1); } -// CHECK-LABEL: @test_vqshrun_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> [[VQSHRUN_N]], <2 x i64> splat (i64 -1)) -// CHECK: ret <2 x i32> [[VQSHRUN_N1]] +// CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> [[A]], <2 x i64> splat (i64 -1)) +// CHECK-NEXT: ret <2 x i32> [[VQSHRUN_N1]] +// uint32x2_t test_vqshrun_n_s64(int64x2_t a) { return vqshrun_n_s64(a, 1); } -// CHECK-LABEL: @test_vqsub_s8( -// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %a, <8 x i8> %b) -// 
CHECK: ret <8 x i8> [[VQSUB_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vqsub_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQSUB_V_I]] +// int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) { return vqsub_s8(a, b); } -// CHECK-LABEL: @test_vqsub_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQSUB_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vqsub_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VQSUB_V2_I]] +// int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) { return vqsub_s16(a, b); } -// CHECK-LABEL: @test_vqsub_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQSUB_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vqsub_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VQSUB_V2_I]] +// int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) { return vqsub_s32(a, b); } -// CHECK-LABEL: @test_vqsub_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.ssub.sat.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQSUB_V2_I]] +// CHECK-LABEL: define <1 x i64> @test_vqsub_s64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.ssub.sat.v1i64(<1 x i64> [[A]], <1 x i64> [[B]]) +// CHECK-NEXT: ret <1 x i64> [[VQSUB_V2_I]] +// int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) { return vqsub_s64(a, b); } -// CHECK-LABEL: @test_vqsub_u8( -// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VQSUB_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vqsub_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VQSUB_V_I]] +// uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) { return vqsub_u8(a, b); } -// CHECK-LABEL: @test_vqsub_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %a, <4 x i16> %b) -// CHECK: [[VQSUB_V3_I:%.*]] 
= bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VQSUB_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vqsub_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> [[A]], <4 x i16> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VQSUB_V2_I]] +// uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) { return vqsub_u16(a, b); } -// CHECK-LABEL: @test_vqsub_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %a, <2 x i32> %b) -// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VQSUB_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vqsub_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> [[A]], <2 x i32> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VQSUB_V2_I]] +// uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) { return vqsub_u32(a, b); } -// CHECK-LABEL: @test_vqsub_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.usub.sat.v1i64(<1 x i64> %a, <1 x i64> %b) -// CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8> -// CHECK: ret <1 x i64> [[VQSUB_V2_I]] +// CHECK-LABEL: define <1 x i64> @test_vqsub_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.usub.sat.v1i64(<1 x i64> [[A]], <1 x i64> [[B]]) +// CHECK-NEXT: ret <1 x i64> [[VQSUB_V2_I]] +// uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) { return vqsub_u64(a, b); } -// CHECK-LABEL: @test_vqsubq_s8( -// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQSUBQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vqsubq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQSUBQ_V_I]] +// int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) { return vqsubq_s8(a, b); } -// CHECK-LABEL: @test_vqsubq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vqsubq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]] +// int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) { return vqsubq_s16(a, b); } -// CHECK-LABEL: @test_vqsubq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: 
[[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vqsubq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]] +// int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) { return vqsubq_s32(a, b); } -// CHECK-LABEL: @test_vqsubq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]] +// CHECK-LABEL: define <2 x i64> @test_vqsubq_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i64> [[VQSUBQ_V2_I]] +// int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) { return vqsubq_s64(a, b); } -// CHECK-LABEL: @test_vqsubq_u8( -// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b) -// CHECK: ret <16 x i8> [[VQSUBQ_V_I]] +// CHECK-LABEL: define <16 x i8> @test_vqsubq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VQSUBQ_V_I]] +// uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) { return vqsubq_u8(a, b); } -// CHECK-LABEL: @test_vqsubq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b) -// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <8 x i16> [[VQSUBQ_V2_I]] +// CHECK-LABEL: define <8 x i16> @test_vqsubq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i16> [[VQSUBQ_V2_I]] +// uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) { return vqsubq_u16(a, b); } -// CHECK-LABEL: @test_vqsubq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x i32> [[VQSUBQ_V2_I]] +// CHECK-LABEL: define <4 x i32> @test_vqsubq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i32> [[VQSUBQ_V2_I]] +// uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) { return vqsubq_u32(a, b); } -// 
CHECK-LABEL: @test_vqsubq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8> -// CHECK: ret <2 x i64> [[VQSUBQ_V2_I]] +// CHECK-LABEL: define <2 x i64> @test_vqsubq_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i64> [[VQSUBQ_V2_I]] +// uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) { return vqsubq_u64(a, b); } -// CHECK-LABEL: @test_vraddhn_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i8> [[VRADDHN_V2_I]] +// CHECK-LABEL: define <8 x i8> @test_vraddhn_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VRADDHN_V2_I]] +// int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) { return vraddhn_s16(a, b); } -// CHECK-LABEL: @test_vraddhn_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VRADDHN_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vraddhn_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VRADDHN_V2_I]] +// int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) { return vraddhn_s32(a, b); } -// CHECK-LABEL: @test_vraddhn_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VRADDHN_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vraddhn_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VRADDHN_V2_I]] +// int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) { return vraddhn_s64(a, b); } -// CHECK-LABEL: @test_vraddhn_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %a, <8 x i16> %b) -// CHECK: ret <8 x i8> [[VRADDHN_V2_I]] +// CHECK-LABEL: define <8 x i8> @test_vraddhn_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VRADDHN_V2_I]] +// uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) { return vraddhn_u16(a, b); } -// CHECK-LABEL: @test_vraddhn_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %a, <4 x i32> %b) -// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8> -// CHECK: ret <4 x i16> [[VRADDHN_V2_I]] +// CHECK-LABEL: define <4 x i16> @test_vraddhn_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]]) +// CHECK-NEXT: ret <4 x i16> [[VRADDHN_V2_I]] +// uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) { return vraddhn_u32(a, b); } -// CHECK-LABEL: @test_vraddhn_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %a, <2 x i64> %b) -// CHECK: [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8> -// CHECK: ret <2 x i32> [[VRADDHN_V2_I]] +// CHECK-LABEL: define <2 x i32> @test_vraddhn_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]]) +// CHECK-NEXT: ret <2 x i32> [[VRADDHN_V2_I]] +// uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) { return vraddhn_u64(a, b); } -// CHECK-LABEL: @test_vrecpe_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %a) -// CHECK: ret <2 x float> [[VRECPE_V1_I]] +// CHECK-LABEL: define <2 x float> @test_vrecpe_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> [[A]]) +// CHECK-NEXT: ret <2 x float> [[VRECPE_V1_I]] +// float32x2_t test_vrecpe_f32(float32x2_t a) { return vrecpe_f32(a); } -// CHECK-LABEL: @test_vrecpe_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %a) -// CHECK: ret <2 x i32> [[VRECPE_V1_I]] +// CHECK-LABEL: define <2 x i32> @test_vrecpe_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> [[A]]) +// CHECK-NEXT: ret <2 x i32> [[VRECPE_V1_I]] +// uint32x2_t test_vrecpe_u32(uint32x2_t a) { return vrecpe_u32(a); } -// CHECK-LABEL: @test_vrecpeq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %a) -// CHECK: ret <4 x float> [[VRECPEQ_V1_I]] +// CHECK-LABEL: define <4 x float> @test_vrecpeq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> 
[[A]]) +// CHECK-NEXT: ret <4 x float> [[VRECPEQ_V1_I]] +// float32x4_t test_vrecpeq_f32(float32x4_t a) { return vrecpeq_f32(a); } -// CHECK-LABEL: @test_vrecpeq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %a) -// CHECK: ret <4 x i32> [[VRECPEQ_V1_I]] +// CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> [[A]]) +// CHECK-NEXT: ret <4 x i32> [[VRECPEQ_V1_I]] +// uint32x4_t test_vrecpeq_u32(uint32x4_t a) { return vrecpeq_u32(a); } -// CHECK-LABEL: @test_vrecps_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %a, <2 x float> %b) -// CHECK: [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8> -// CHECK: ret <2 x float> [[VRECPS_V2_I]] +// CHECK-LABEL: define <2 x float> @test_vrecps_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> [[A]], <2 x float> [[B]]) +// CHECK-NEXT: ret <2 x float> [[VRECPS_V2_I]] +// float32x2_t test_vrecps_f32(float32x2_t a, float32x2_t b) { return vrecps_f32(a, b); } -// CHECK-LABEL: @test_vrecpsq_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %a, <4 x float> %b) -// CHECK: [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8> -// CHECK: ret <4 x float> [[VRECPSQ_V2_I]] +// CHECK-LABEL: define <4 x float> @test_vrecpsq_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> [[A]], <4 x float> [[B]]) +// CHECK-NEXT: ret <4 x float> [[VRECPSQ_V2_I]] +// float32x4_t test_vrecpsq_f32(float32x4_t a, float32x4_t b) { return vrecpsq_f32(a, b); } -// CHECK-LABEL: @test_vreinterpret_s8_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// int8x8_t test_vreinterpret_s8_s16(int16x4_t a) { return vreinterpret_s8_s16(a); } -// CHECK-LABEL: @test_vreinterpret_s8_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// int8x8_t test_vreinterpret_s8_s32(int32x2_t a) { return vreinterpret_s8_s32(a); } -// CHECK-LABEL: @test_vreinterpret_s8_s64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s64( +// CHECK-SAME: <1 x 
i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// int8x8_t test_vreinterpret_s8_s64(int64x1_t a) { return vreinterpret_s8_s64(a); } -// CHECK-LABEL: @test_vreinterpret_s8_u8( -// CHECK: ret <8 x i8> %a +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i8> [[A]] +// int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) { return vreinterpret_s8_u8(a); } -// CHECK-LABEL: @test_vreinterpret_s8_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) { return vreinterpret_s8_u16(a); } -// CHECK-LABEL: @test_vreinterpret_s8_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) { return vreinterpret_s8_u32(a); } -// CHECK-LABEL: @test_vreinterpret_s8_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) { return vreinterpret_s8_u64(a); } -// CHECK-LABEL: @test_vreinterpret_s8_f16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f16( +// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// int8x8_t test_vreinterpret_s8_f16(float16x4_t a) { return vreinterpret_s8_f16(a); } -// CHECK-LABEL: @test_vreinterpret_s8_f32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f32( +// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// int8x8_t test_vreinterpret_s8_f32(float32x2_t a) { return vreinterpret_s8_f32(a); } -// CHECK-LABEL: @test_vreinterpret_s8_p8( -// CHECK: ret <8 x i8> %a +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i8> [[A]] +// int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) { return vreinterpret_s8_p8(a); } -// CHECK-LABEL: @test_vreinterpret_s8_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: ret <8 x i8> [[TMP0]] +// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p16( +// CHECK-SAME: <4 x i16> noundef 
[[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) { return vreinterpret_s8_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_s8(int8x8_t a) { return vreinterpret_s16_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_s32(int32x2_t a) { return vreinterpret_s16_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_s64(int64x1_t a) { return vreinterpret_s16_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) { return vreinterpret_s16_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_u16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
 int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) { return vreinterpret_s16_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) { return vreinterpret_s16_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) { return vreinterpret_s16_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_f16(float16x4_t a) { return vreinterpret_s16_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_f32(float32x2_t a) { return vreinterpret_s16_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) { return vreinterpret_s16_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_s16_p16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
 int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) { return vreinterpret_s16_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_s8(int8x8_t a) { return vreinterpret_s32_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_s16(int16x4_t a) { return vreinterpret_s32_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_s64(int64x1_t a) { return vreinterpret_s32_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) { return vreinterpret_s32_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) { return vreinterpret_s32_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_u32(
-// CHECK: ret <2 x i32> %a
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i32> [[A]]
+//
 int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) { return vreinterpret_s32_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) { return vreinterpret_s32_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_f16(float16x4_t a) { return vreinterpret_s32_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_f32(float32x2_t a) { return vreinterpret_s32_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) { return vreinterpret_s32_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_s32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) { return vreinterpret_s32_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 int64x1_t test_vreinterpret_s64_s8(int8x8_t a) { return vreinterpret_s64_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 int64x1_t test_vreinterpret_s64_s16(int16x4_t a) { return vreinterpret_s64_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 int64x1_t test_vreinterpret_s64_s32(int32x2_t a) { return vreinterpret_s64_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) { return vreinterpret_s64_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) { return vreinterpret_s64_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) { return vreinterpret_s64_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_u64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
 int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) { return vreinterpret_s64_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 int64x1_t test_vreinterpret_s64_f16(float16x4_t a) { return vreinterpret_s64_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 int64x1_t test_vreinterpret_s64_f32(float32x2_t a) { return vreinterpret_s64_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) { return vreinterpret_s64_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_s64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) { return vreinterpret_s64_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_s8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
 uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) { return vreinterpret_u8_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) { return vreinterpret_u8_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) { return vreinterpret_u8_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) { return vreinterpret_u8_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) { return vreinterpret_u8_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) { return vreinterpret_u8_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) { return vreinterpret_u8_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) { return vreinterpret_u8_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) { return vreinterpret_u8_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_p8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
 uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) { return vreinterpret_u8_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_u8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) { return vreinterpret_u8_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) { return vreinterpret_u16_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_s16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
 uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) { return vreinterpret_u16_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) { return vreinterpret_u16_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) { return vreinterpret_u16_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) { return vreinterpret_u16_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) { return vreinterpret_u16_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) { return vreinterpret_u16_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) { return vreinterpret_u16_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) { return vreinterpret_u16_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) { return vreinterpret_u16_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_u16_p16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
 uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) { return vreinterpret_u16_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) { return vreinterpret_u32_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) { return vreinterpret_u32_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_s32(
-// CHECK: ret <2 x i32> %a
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i32> [[A]]
+//
 uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) { return vreinterpret_u32_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) { return vreinterpret_u32_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) { return vreinterpret_u32_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) { return vreinterpret_u32_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) { return vreinterpret_u32_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) { return vreinterpret_u32_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) { return vreinterpret_u32_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) { return vreinterpret_u32_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_u32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
-// CHECK: ret <2 x i32> [[TMP0]]
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
 uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) { return vreinterpret_u32_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) { return vreinterpret_u64_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) { return vreinterpret_u64_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) { return vreinterpret_u64_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_s64(
-// CHECK: ret <1 x i64> %a
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <1 x i64> [[A]]
+//
 uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) { return vreinterpret_u64_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) { return vreinterpret_u64_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) { return vreinterpret_u64_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) { return vreinterpret_u64_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) { return vreinterpret_u64_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) { return vreinterpret_u64_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) { return vreinterpret_u64_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_u64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
-// CHECK: ret <1 x i64> [[TMP0]]
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <1 x i64>
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
 uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) { return vreinterpret_u64_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_s8(int8x8_t a) { return vreinterpret_f16_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_s16(int16x4_t a) { return vreinterpret_f16_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_s32(int32x2_t a) { return vreinterpret_f16_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_s64(int64x1_t a) { return vreinterpret_f16_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) { return vreinterpret_f16_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) { return vreinterpret_f16_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) { return vreinterpret_f16_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) { return vreinterpret_f16_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_f32(float32x2_t a) { return vreinterpret_f16_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) { return vreinterpret_f16_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_f16_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
-// CHECK: ret <4 x half> [[TMP0]]
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP0]]
+//
 float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) { return vreinterpret_f16_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vreinterpret_f32_s8(int8x8_t a) { return vreinterpret_f32_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vreinterpret_f32_s16(int16x4_t a) { return vreinterpret_f32_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vreinterpret_f32_s32(int32x2_t a) { return vreinterpret_f32_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vreinterpret_f32_s64(int64x1_t a) { return vreinterpret_f32_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) { return vreinterpret_f32_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) { return vreinterpret_f32_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) { return vreinterpret_f32_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) { return vreinterpret_f32_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vreinterpret_f32_f16(float16x4_t a) { return vreinterpret_f32_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) { return vreinterpret_f32_p8(a); }
-// CHECK-LABEL: @test_vreinterpret_f32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
-// CHECK: ret <2 x float> [[TMP0]]
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <2 x float>
+// CHECK-NEXT: ret <2 x float> [[TMP0]]
+//
 float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) { return vreinterpret_f32_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_s8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
 poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) { return vreinterpret_p8_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) { return vreinterpret_p8_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) { return vreinterpret_p8_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) { return vreinterpret_p8_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_u8(
-// CHECK: ret <8 x i8> %a
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i8> [[A]]
+//
 poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) { return vreinterpret_p8_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) { return vreinterpret_p8_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) { return vreinterpret_p8_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) { return vreinterpret_p8_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) { return vreinterpret_p8_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) { return vreinterpret_p8_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_p8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
 poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) { return vreinterpret_p8_p16(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) { return vreinterpret_p16_s8(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_s16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
 poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) { return vreinterpret_p16_s16(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) { return vreinterpret_p16_s32(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) { return vreinterpret_p16_s64(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) { return vreinterpret_p16_u8(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_u16(
-// CHECK: ret <4 x i16> %a
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i16> [[A]]
+//
 poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) { return vreinterpret_p16_u16(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) { return vreinterpret_p16_u32(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) { return vreinterpret_p16_u64(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f16(
+// CHECK-SAME: <4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) { return vreinterpret_p16_f16(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) { return vreinterpret_p16_f32(a); }
-// CHECK-LABEL: @test_vreinterpret_p16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
-// CHECK: ret <4 x i16> [[TMP0]]
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[A]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
 poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) { return vreinterpret_p16_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) { return vreinterpretq_s8_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) { return vreinterpretq_s8_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) { return vreinterpretq_s8_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_u8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
 int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) { return vreinterpretq_s8_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) { return vreinterpretq_s8_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) { return vreinterpretq_s8_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) { return vreinterpretq_s8_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) { return vreinterpretq_s8_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) { return vreinterpretq_s8_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_p8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
 int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) { return vreinterpretq_s8_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) { return vreinterpretq_s8_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) { return vreinterpretq_s16_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) { return vreinterpretq_s16_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) { return vreinterpretq_s16_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) { return vreinterpretq_s16_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_u16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
 int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) { return vreinterpretq_s16_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) { return vreinterpretq_s16_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) { return vreinterpretq_s16_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) { return vreinterpretq_s16_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) { return vreinterpretq_s16_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) { return vreinterpretq_s16_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s16_p16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
 int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) { return vreinterpretq_s16_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) { return vreinterpretq_s32_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) { return vreinterpretq_s32_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) { return vreinterpretq_s32_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) { return vreinterpretq_s32_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) { return vreinterpretq_s32_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_u32(
-// CHECK: ret <4 x i32> %a
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i32> [[A]]
+//
 int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) { return vreinterpretq_s32_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) { return vreinterpretq_s32_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) { return vreinterpretq_s32_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) { return vreinterpretq_s32_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) { return vreinterpretq_s32_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) { return vreinterpretq_s32_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) { return vreinterpretq_s64_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) { return vreinterpretq_s64_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) { return vreinterpretq_s64_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) { return vreinterpretq_s64_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_s64_u16(
-//
CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) { return vreinterpretq_s64_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) { return vreinterpretq_s64_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_u64( -// CHECK: ret <2 x i64> %a +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <2 x i64> [[A]] +// int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) { return vreinterpretq_s64_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) { return vreinterpretq_s64_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_f32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) { return vreinterpretq_s64_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_p8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) { return vreinterpretq_s64_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_s64_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64> -// CHECK: ret <2 x i64> [[TMP0]] +// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) { return vreinterpretq_s64_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_s8( -// CHECK: ret <16 x i8> %a +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[A]] +// uint8x16_t 
test_vreinterpretq_u8_s8(int8x16_t a) { return vreinterpretq_u8_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) { return vreinterpretq_u8_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) { return vreinterpretq_u8_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) { return vreinterpretq_u8_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) { return vreinterpretq_u8_u16(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) { return vreinterpretq_u8_u32(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) { return vreinterpretq_u8_u64(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_f16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f16( +// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) { return vreinterpretq_u8_f16(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_f32( -// CHECK: [[TMP0:%.*]] = 
bitcast <4 x float> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) { return vreinterpretq_u8_f32(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_p8( -// CHECK: ret <16 x i8> %a +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[A]] +// uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) { return vreinterpretq_u8_p8(a); } -// CHECK-LABEL: @test_vreinterpretq_u8_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: ret <16 x i8> [[TMP0]] +// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) { return vreinterpretq_u8_p16(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_s8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) { return vreinterpretq_u16_s8(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_s16( -// CHECK: ret <8 x i16> %a +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i16> [[A]] +// uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) { return vreinterpretq_u16_s16(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) { return vreinterpretq_u16_s32(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) { return vreinterpretq_u16_s64(a); } -// CHECK-LABEL: @test_vreinterpretq_u16_u8( -// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16> -// CHECK: ret <8 x i16> [[TMP0]] +// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) { return 
-// CHECK-LABEL: @test_vreinterpretq_u16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) { return vreinterpretq_u16_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) { return vreinterpretq_u16_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) { return vreinterpretq_u16_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) { return vreinterpretq_u16_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) { return vreinterpretq_u16_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u16_p16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
 uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) { return vreinterpretq_u16_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) { return vreinterpretq_u32_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) { return vreinterpretq_u32_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_s32(
-// CHECK: ret <4 x i32> %a
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <4 x i32> [[A]]
+//
 uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) { return vreinterpretq_u32_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) { return vreinterpretq_u32_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) { return vreinterpretq_u32_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) { return vreinterpretq_u32_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) { return vreinterpretq_u32_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) { return vreinterpretq_u32_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) { return vreinterpretq_u32_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) { return vreinterpretq_u32_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
-// CHECK: ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) { return vreinterpretq_u32_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) { return vreinterpretq_u64_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) { return vreinterpretq_u64_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) { return vreinterpretq_u64_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_s64(
-// CHECK: ret <2 x i64> %a
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <2 x i64> [[A]]
+//
 uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) { return vreinterpretq_u64_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) { return vreinterpretq_u64_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) { return vreinterpretq_u64_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) { return vreinterpretq_u64_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) { return vreinterpretq_u64_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) { return vreinterpretq_u64_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) { return vreinterpretq_u64_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_u64_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
-// CHECK: ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <2 x i64>
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) { return vreinterpretq_u64_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) { return vreinterpretq_f16_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) { return vreinterpretq_f16_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) { return vreinterpretq_f16_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) { return vreinterpretq_f16_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) { return vreinterpretq_f16_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) { return vreinterpretq_f16_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) { return vreinterpretq_f16_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) { return vreinterpretq_f16_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) { return vreinterpretq_f16_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) { return vreinterpretq_f16_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_f16_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
-// CHECK: ret <8 x half> [[TMP0]]
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
 float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) { return vreinterpretq_f16_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) { return vreinterpretq_f32_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) { return vreinterpretq_f32_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) { return vreinterpretq_f32_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) { return vreinterpretq_f32_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) { return vreinterpretq_f32_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) { return vreinterpretq_f32_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) { return vreinterpretq_f32_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) { return vreinterpretq_f32_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) { return vreinterpretq_f32_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) { return vreinterpretq_f32_p8(a); }
-// CHECK-LABEL: @test_vreinterpretq_f32_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
-// CHECK: ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) { return vreinterpretq_f32_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_p8_s8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
 poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) { return vreinterpretq_p8_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_p8_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) { return vreinterpretq_p8_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_p8_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) { return vreinterpretq_p8_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_p8_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) { return vreinterpretq_p8_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_p8_u8(
-// CHECK: ret <16 x i8> %a
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <16 x i8> [[A]]
+//
 poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) { return vreinterpretq_p8_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_p8_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) { return vreinterpretq_p8_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_p8_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) { return vreinterpretq_p8_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_p8_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) { return vreinterpretq_p8_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_p8_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) { return vreinterpretq_p8_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_p8_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) { return vreinterpretq_p8_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_p8_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) { return vreinterpretq_p8_p16(a); }
-// CHECK-LABEL: @test_vreinterpretq_p16_s8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) { return vreinterpretq_p16_s8(a); }
-// CHECK-LABEL: @test_vreinterpretq_p16_s16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
 poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) { return vreinterpretq_p16_s16(a); }
-// CHECK-LABEL: @test_vreinterpretq_p16_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) { return vreinterpretq_p16_s32(a); }
-// CHECK-LABEL: @test_vreinterpretq_p16_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) { return vreinterpretq_p16_s64(a); }
-// CHECK-LABEL: @test_vreinterpretq_p16_u8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) { return vreinterpretq_p16_u8(a); }
-// CHECK-LABEL: @test_vreinterpretq_p16_u16(
-// CHECK: ret <8 x i16> %a
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: ret <8 x i16> [[A]]
+//
 poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) { return vreinterpretq_p16_u16(a); }
-// CHECK-LABEL: @test_vreinterpretq_p16_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) { return vreinterpretq_p16_u32(a); }
-// CHECK-LABEL: @test_vreinterpretq_p16_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) { return vreinterpretq_p16_u64(a); }
-// CHECK-LABEL: @test_vreinterpretq_p16_f16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f16(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) { return vreinterpretq_p16_f16(a); }
-// CHECK-LABEL: @test_vreinterpretq_p16_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) { return vreinterpretq_p16_f32(a); }
-// CHECK-LABEL: @test_vreinterpretq_p16_p8(
-// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
-// CHECK: ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) { return vreinterpretq_p16_p8(a); }
-// CHECK-LABEL: @test_vrev16_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev16_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vrev16_s8(int8x8_t a) { return vrev16_s8(a); }
-// CHECK-LABEL: @test_vrev16_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev16_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vrev16_u8(uint8x8_t a) { return vrev16_u8(a); }
-// CHECK-LABEL: @test_vrev16_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vrev16_p8(poly8x8_t a) { return vrev16_p8(a); }
-// CHECK-LABEL: @test_vrev16q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vrev16q_s8(int8x16_t a) { return vrev16q_s8(a); }
-// CHECK-LABEL: @test_vrev16q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vrev16q_u8(uint8x16_t a) { return vrev16q_u8(a); }
-// CHECK-LABEL: @test_vrev16q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 poly8x16_t test_vrev16q_p8(poly8x16_t a) { return vrev16q_p8(a); }
-// CHECK-LABEL: @test_vrev32_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vrev32_s8(int8x8_t a) { return vrev32_s8(a); }
-// CHECK-LABEL: @test_vrev32_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vrev32_s16(int16x4_t a) { return vrev32_s16(a); }
-// CHECK-LABEL: @test_vrev32_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vrev32_u8(uint8x8_t a) { return vrev32_u8(a); }
-// CHECK-LABEL: @test_vrev32_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev32_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 uint16x4_t test_vrev32_u16(uint16x4_t a) { return vrev32_u16(a); }
-// CHECK-LABEL: @test_vrev32_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vrev32_p8(poly8x8_t a) { return vrev32_p8(a); }
-// CHECK-LABEL: @test_vrev32_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vrev32_p16(poly16x4_t a) { return vrev32_p16(a); }
-// CHECK-LABEL: @test_vrev32q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vrev32q_s8(int8x16_t a) { return vrev32q_s8(a); }
-// CHECK-LABEL: @test_vrev32q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vrev32q_s16(int16x8_t a) { return vrev32q_s16(a); }
-// CHECK-LABEL: @test_vrev32q_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 uint8x16_t test_vrev32q_u8(uint8x16_t a) { return vrev32q_u8(a); }
-// CHECK-LABEL: @test_vrev32q_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 uint16x8_t test_vrev32q_u16(uint16x8_t a) { return vrev32q_u16(a); }
-// CHECK-LABEL: @test_vrev32q_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 poly8x16_t test_vrev32q_p8(poly8x16_t a) { return vrev32q_p8(a); }
-// CHECK-LABEL: @test_vrev32q_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 poly16x8_t test_vrev32q_p16(poly16x8_t a) { return vrev32q_p16(a); }
-// CHECK-LABEL: @test_vrev64_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 int8x8_t test_vrev64_s8(int8x8_t a) { return vrev64_s8(a); }
-// CHECK-LABEL: @test_vrev64_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 int16x4_t test_vrev64_s16(int16x4_t a) { return vrev64_s16(a); }
-// CHECK-LABEL: @test_vrev64_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 int32x2_t test_vrev64_s32(int32x2_t a) { return vrev64_s32(a); }
-// CHECK-LABEL: @test_vrev64_u8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 uint8x8_t test_vrev64_u8(uint8x8_t a) { return vrev64_u8(a); }
-// CHECK-LABEL: @test_vrev64_u16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 uint16x4_t test_vrev64_u16(uint16x4_t a) { return vrev64_u16(a); }
-// CHECK-LABEL: @test_vrev64_u32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
-// CHECK: ret <2 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]]
+//
 uint32x2_t test_vrev64_u32(uint32x2_t a) { return vrev64_u32(a); }
-// CHECK-LABEL: @test_vrev64_p8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <8 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]]
+//
 poly8x8_t test_vrev64_p8(poly8x8_t a) { return vrev64_p8(a); }
-// CHECK-LABEL: @test_vrev64_p16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK: ret <4 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]]
+//
 poly16x4_t test_vrev64_p16(poly16x4_t a) { return vrev64_p16(a); }
-// CHECK-LABEL: @test_vrev64_f32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
-// CHECK: ret <2 x float> [[SHUFFLE_I]]
+// CHECK-LABEL: define <2 x float> @test_vrev64_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]]
+//
 float32x2_t test_vrev64_f32(float32x2_t a) { return vrev64_f32(a); }
-// CHECK-LABEL: @test_vrev64q_s8(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
-// CHECK: ret <16 x i8> [[SHUFFLE_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]]
+//
 int8x16_t test_vrev64q_s8(int8x16_t a) { return vrev64q_s8(a); }
-// CHECK-LABEL: @test_vrev64q_s16(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK: ret <8 x i16> [[SHUFFLE_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]]
+//
 int16x8_t test_vrev64q_s16(int16x8_t a) { return vrev64q_s16(a); }
-// CHECK-LABEL: @test_vrev64q_s32(
-// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
-// CHECK: ret <4 x i32> [[SHUFFLE_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]]
+//
 int32x4_t test_vrev64q_s32(int32x4_t a) { return vrev64q_s32(a); }
-// CHECK-LABEL: @test_vrev64q_u8(
-//
CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define <16 x i8> @test_vrev64q_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// uint8x16_t test_vrev64q_u8(uint8x16_t a) { return vrev64q_u8(a); } -// CHECK-LABEL: @test_vrev64q_u16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// uint16x8_t test_vrev64q_u16(uint16x8_t a) { return vrev64q_u16(a); } -// CHECK-LABEL: @test_vrev64q_u32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> -// CHECK: ret <4 x i32> [[SHUFFLE_I]] +// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] +// uint32x4_t test_vrev64q_u32(uint32x4_t a) { return vrev64q_u32(a); } -// CHECK-LABEL: @test_vrev64q_p8( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> -// CHECK: ret <16 x i8> [[SHUFFLE_I]] +// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> poison, <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// poly8x16_t test_vrev64q_p8(poly8x16_t a) { return vrev64q_p8(a); } -// CHECK-LABEL: @test_vrev64q_p16( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> -// CHECK: ret <8 x i16> [[SHUFFLE_I]] +// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> +// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] +// poly16x8_t test_vrev64q_p16(poly16x8_t a) { return vrev64q_p16(a); } -// CHECK-LABEL: @test_vrev64q_f32( -// CHECK: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> -// CHECK: ret <4 x float> [[SHUFFLE_I]] +// CHECK-LABEL: define <4 x float> @test_vrev64q_f32( +// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> +// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]] +// float32x4_t test_vrev64q_f32(float32x4_t a) { return vrev64q_f32(a); } -// CHECK-LABEL: @test_vrhadd_s8( -// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %a, <8 x i8> %b) -// CHECK: ret <8 x i8> [[VRHADD_V_I]] +// CHECK-LABEL: define <8 x i8> @test_vrhadd_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VRHADD_V_I]] +// int8x8_t 
 int8x8_t test_vrhadd_s8(int8x8_t a, int8x8_t b) {
   return vrhadd_s8(a, b);
 }

-// CHECK-LABEL: @test_vrhadd_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VRHADD_V2_I]]
+//
 int16x4_t test_vrhadd_s16(int16x4_t a, int16x4_t b) {
   return vrhadd_s16(a, b);
 }

-// CHECK-LABEL: @test_vrhadd_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VRHADD_V2_I]]
+//
 int32x2_t test_vrhadd_s32(int32x2_t a, int32x2_t b) {
   return vrhadd_s32(a, b);
 }

-// CHECK-LABEL: @test_vrhadd_u8(
-// CHECK: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VRHADD_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrhadd_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRHADD_V_I]]
+//
 uint8x8_t test_vrhadd_u8(uint8x8_t a, uint8x8_t b) {
   return vrhadd_u8(a, b);
 }

-// CHECK-LABEL: @test_vrhadd_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRHADD_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VRHADD_V2_I]]
+//
 uint16x4_t test_vrhadd_u16(uint16x4_t a, uint16x4_t b) {
   return vrhadd_u16(a, b);
 }

-// CHECK-LABEL: @test_vrhadd_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRHADD_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VRHADD_V2_I]]
+//
 uint32x2_t test_vrhadd_u32(uint32x2_t a, uint32x2_t b) {
   return vrhadd_u32(a, b);
 }

-// CHECK-LABEL: @test_vrhaddq_s8(
-// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrhaddq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VRHADDQ_V_I]]
+//
 int8x16_t test_vrhaddq_s8(int8x16_t a, int8x16_t b) {
   return vrhaddq_s8(a, b);
 }

-// CHECK-LABEL: @test_vrhaddq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VRHADDQ_V2_I]]
+//
 int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) {
   return vrhaddq_s16(a, b);
 }

-// CHECK-LABEL: @test_vrhaddq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VRHADDQ_V2_I]]
+//
 int32x4_t test_vrhaddq_s32(int32x4_t a, int32x4_t b) {
   return vrhaddq_s32(a, b);
 }

-// CHECK-LABEL: @test_vrhaddq_u8(
-// CHECK: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VRHADDQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrhaddq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VRHADDQ_V_I]]
+//
 uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vrhaddq_u8(a, b);
 }

-// CHECK-LABEL: @test_vrhaddq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VRHADDQ_V2_I]]
+//
 uint16x8_t test_vrhaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vrhaddq_u16(a, b);
 }

-// CHECK-LABEL: @test_vrhaddq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRHADDQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VRHADDQ_V2_I]]
+//
 uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vrhaddq_u32(a, b);
 }

-// CHECK-LABEL: @test_vrshl_s8(
-// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VRSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSHL_V_I]]
+//
 int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
   return vrshl_s8(a, b);
 }

-// CHECK-LABEL: @test_vrshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VRSHL_V2_I]]
+//
 int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
   return vrshl_s16(a, b);
 }

-// CHECK-LABEL: @test_vrshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VRSHL_V2_I]]
+//
 int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
   return vrshl_s32(a, b);
 }

-// CHECK-LABEL: @test_vrshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vrshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VRSHL_V2_I]]
+//
 int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
   return vrshl_s64(a, b);
 }

-// CHECK-LABEL: @test_vrshl_u8(
-// CHECK: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VRSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSHL_V_I]]
+//
 uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
   return vrshl_u8(a, b);
 }

-// CHECK-LABEL: @test_vrshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VRSHL_V2_I]]
+//
 uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
   return vrshl_u16(a, b);
 }

-// CHECK-LABEL: @test_vrshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VRSHL_V2_I]]
+//
 uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
   return vrshl_u32(a, b);
 }

-// CHECK-LABEL: @test_vrshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VRSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vrshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VRSHL_V2_I]]
+//
 uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
   return vrshl_u64(a, b);
 }

-// CHECK-LABEL: @test_vrshlq_s8(
-// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrshlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VRSHLQ_V_I]]
+//
 int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
   return vrshlq_s8(a, b);
 }

-// CHECK-LABEL: @test_vrshlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrshlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VRSHLQ_V2_I]]
+//
 int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
   return vrshlq_s16(a, b);
 }

-// CHECK-LABEL: @test_vrshlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrshlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VRSHLQ_V2_I]]
+//
 int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
   return vrshlq_s32(a, b);
 }

-// CHECK-LABEL: @test_vrshlq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vrshlq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VRSHLQ_V2_I]]
+//
 int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
   return vrshlq_s64(a, b);
 }

-// CHECK-LABEL: @test_vrshlq_u8(
-// CHECK: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VRSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vrshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VRSHLQ_V_I]]
+//
 uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
   return vrshlq_u8(a, b);
 }

-// CHECK-LABEL: @test_vrshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vrshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VRSHLQ_V2_I]]
+//
 uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
   return vrshlq_u16(a, b);
 }

-// CHECK-LABEL: @test_vrshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VRSHLQ_V2_I]]
+//
 uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
   return vrshlq_u32(a, b);
 }

-// CHECK-LABEL: @test_vrshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VRSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vrshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VRSHLQ_V2_I]]
+//
 uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
   return vrshlq_u64(a, b);
 }

-// CHECK-LABEL: @test_vrshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VRSHRN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VRSHRN_N1]]
+//
 int8x8_t test_vrshrn_n_s16(int16x8_t a) {
   return vrshrn_n_s16(a, 1);
 }

-// CHECK-LABEL: @test_vrshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VRSHRN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VRSHRN_N1]]
+//
 int16x4_t test_vrshrn_n_s32(int32x4_t a) {
   return vrshrn_n_s32(a, 1);
 }

-// CHECK-LABEL: @test_vrshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VRSHRN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VRSHRN_N1]]
+//
 int32x2_t test_vrshrn_n_s64(int64x2_t a) {
   return vrshrn_n_s64(a, 1);
 }

-// CHECK-LABEL: @test_vrshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i8> [[VRSHRN_N1]]
+// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i8> [[VRSHRN_N1]]
+//
 uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
   return vrshrn_n_u16(a, 1);
 }

-// CHECK-LABEL: @test_vrshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i16> [[VRSHRN_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i16> [[VRSHRN_N1]]
+//
 uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
   return vrshrn_n_u32(a, 1);
 }

-// CHECK-LABEL: @test_vrshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i32> [[VRSHRN_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i32> [[VRSHRN_N1]]
+//
 uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
   return vrshrn_n_u64(a, 1);
 }

-// CHECK-LABEL: @test_vrshr_n_s8(
-// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> splat (i8 -1))
-// CHECK: ret <8 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define <8 x i8> @test_vrshr_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <8 x i8> [[VRSHR_N]]
+//
 int8x8_t test_vrshr_n_s8(int8x8_t a) {
   return vrshr_n_s8(a, 1);
 }

-// CHECK-LABEL: @test_vrshr_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -1))
-// CHECK: ret <4 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vrshr_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[A]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <4 x i16> [[VRSHR_N1]]
+//
 int16x4_t test_vrshr_n_s16(int16x4_t a) {
   return vrshr_n_s16(a, 1);
 }

-// CHECK-LABEL: @test_vrshr_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -1))
-// CHECK: ret <2 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vrshr_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[A]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <2 x i32> [[VRSHR_N1]]
+//
 int32x2_t test_vrshr_n_s32(int32x2_t a) {
   return vrshr_n_s32(a, 1);
 }

-// CHECK-LABEL: @test_vrshr_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
-// CHECK: ret <1 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define <1 x i64> @test_vrshr_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <1 x i64> [[VRSHR_N1]]
+//
 int64x1_t test_vrshr_n_s64(int64x1_t a) {
   return vrshr_n_s64(a, 1);
 }

-// CHECK-LABEL: @test_vrshr_n_u8(
-// CHECK: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> splat (i8 -1))
-// CHECK: ret <8 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define <8 x i8> @test_vrshr_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> [[A]], <8 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <8 x i8> [[VRSHR_N]]
+//
 uint8x8_t test_vrshr_n_u8(uint8x8_t a) {
   return vrshr_n_u8(a, 1);
 }

-// CHECK-LABEL: @test_vrshr_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> splat (i16 -1))
-// CHECK: ret <4 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define <4 x i16> @test_vrshr_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[A]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <4 x i16> [[VRSHR_N1]]
+//
 uint16x4_t test_vrshr_n_u16(uint16x4_t a) {
   return vrshr_n_u16(a, 1);
 }

-// CHECK-LABEL: @test_vrshr_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> splat (i32 -1))
-// CHECK: ret <2 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define <2 x i32> @test_vrshr_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[A]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <2 x i32> [[VRSHR_N1]]
+//
 uint32x2_t test_vrshr_n_u32(uint32x2_t a) {
   return vrshr_n_u32(a, 1);
 }

-// CHECK-LABEL: @test_vrshr_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> splat (i64 -1))
-// CHECK: ret <1 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define <1 x i64> @test_vrshr_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[A]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <1 x i64> [[VRSHR_N1]]
+//
 uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
   return vrshr_n_u64(a, 1);
 }

-// CHECK-LABEL: @test_vrshrq_n_s8(
-// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> splat (i8 -1))
-// CHECK: ret <16 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <16 x i8> [[VRSHR_N]]
+//
 int8x16_t test_vrshrq_n_s8(int8x16_t a) {
   return vrshrq_n_s8(a, 1);
 }

-// CHECK-LABEL: @test_vrshrq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i16> [[VRSHR_N1]]
+//
 int16x8_t test_vrshrq_n_s16(int16x8_t a) {
   return vrshrq_n_s16(a, 1);
 }

-// CHECK-LABEL: @test_vrshrq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i32> [[VRSHR_N1]]
+//
 int32x4_t test_vrshrq_n_s32(int32x4_t a) {
   return vrshrq_n_s32(a, 1);
 }

-// CHECK-LABEL: @test_vrshrq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i64> [[VRSHR_N1]]
+//
 int64x2_t test_vrshrq_n_s64(int64x2_t a) {
   return vrshrq_n_s64(a, 1);
 }

-// CHECK-LABEL: @test_vrshrq_n_u8(
-// CHECK: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> splat (i8 -1))
-// CHECK: ret <16 x i8> [[VRSHR_N]]
+// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> [[A]], <16 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <16 x i8> [[VRSHR_N]]
+//
 uint8x16_t test_vrshrq_n_u8(uint8x16_t a) {
   return vrshrq_n_u8(a, 1);
 }

-// CHECK-LABEL: @test_vrshrq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i16> [[VRSHR_N1]]
+// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[A]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i16> [[VRSHR_N1]]
+//
 uint16x8_t test_vrshrq_n_u16(uint16x8_t a) {
   return vrshrq_n_u16(a, 1);
 }

-// CHECK-LABEL: @test_vrshrq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i32> [[VRSHR_N1]]
+// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[A]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i32> [[VRSHR_N1]]
+//
 uint32x4_t test_vrshrq_n_u32(uint32x4_t a) {
   return vrshrq_n_u32(a, 1);
 }

-// CHECK-LABEL: @test_vrshrq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i64> [[VRSHR_N1]]
+// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[A]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i64> [[VRSHR_N1]]
+//
 uint64x2_t test_vrshrq_n_u64(uint64x2_t a) {
   return vrshrq_n_u64(a, 1);
 }

-// CHECK-LABEL: @test_vrsqrte_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %a)
-// CHECK: ret <2 x float> [[VRSQRTE_V1_I]]
+// CHECK-LABEL: define <2 x float> @test_vrsqrte_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> [[A]])
+// CHECK-NEXT: ret <2 x float> [[VRSQRTE_V1_I]]
+//
 float32x2_t test_vrsqrte_f32(float32x2_t a) {
   return vrsqrte_f32(a);
 }

-// CHECK-LABEL: @test_vrsqrte_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %a)
-// CHECK: ret <2 x i32> [[VRSQRTE_V1_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrsqrte_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> [[A]])
+// CHECK-NEXT: ret <2 x i32> [[VRSQRTE_V1_I]]
+//
 uint32x2_t test_vrsqrte_u32(uint32x2_t a) {
   return vrsqrte_u32(a);
 }

-// CHECK-LABEL: @test_vrsqrteq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %a)
-// CHECK: ret <4 x float> [[VRSQRTEQ_V1_I]]
+// CHECK-LABEL: define <4 x float> @test_vrsqrteq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> [[A]])
+// CHECK-NEXT: ret <4 x float> [[VRSQRTEQ_V1_I]]
+//
 float32x4_t test_vrsqrteq_f32(float32x4_t a) {
   return vrsqrteq_f32(a);
 }

-// CHECK-LABEL: @test_vrsqrteq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %a)
-// CHECK: ret <4 x i32> [[VRSQRTEQ_V1_I]]
+// CHECK-LABEL: define <4 x i32> @test_vrsqrteq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> [[A]])
+// CHECK-NEXT: ret <4 x i32> [[VRSQRTEQ_V1_I]]
+//
 uint32x4_t test_vrsqrteq_u32(uint32x4_t a) {
   return vrsqrteq_u32(a);
 }

-// CHECK-LABEL: @test_vrsqrts_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %a, <2 x float> %b)
-// CHECK: [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8>
-// CHECK: ret <2 x float> [[VRSQRTS_V2_I]]
+// CHECK-LABEL: define <2 x float> @test_vrsqrts_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> [[A]], <2 x float> [[B]])
+// CHECK-NEXT: ret <2 x float> [[VRSQRTS_V2_I]]
+//
 float32x2_t test_vrsqrts_f32(float32x2_t a, float32x2_t b) {
   return vrsqrts_f32(a, b);
 }

-// CHECK-LABEL: @test_vrsqrtsq_f32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %a, <4 x float> %b)
-// CHECK: [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x float> [[VRSQRTSQ_V2_I]]
+// CHECK-LABEL: define <4 x float> @test_vrsqrtsq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> [[A]], <4 x float> [[B]])
+// CHECK-NEXT: ret <4 x float> [[VRSQRTSQ_V2_I]]
+//
 float32x4_t test_vrsqrtsq_f32(float32x4_t a, float32x4_t b) {
   return vrsqrtsq_f32(a, b);
 }

-// CHECK-LABEL: @test_vrsra_n_s8(
-// CHECK: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %b, <8 x i8> splat (i8 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
-// CHECK: ret <8 x i8> [[VRSRA_N]]
+// CHECK-LABEL: define <8 x i8> @test_vrsra_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> [[B]], <8 x i8> splat (i8 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <8 x i8> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <8 x i8> [[VRSRA_N]]
+//
 int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) {
   return vrsra_n_s8(a, b, 1);
 }

-// CHECK-LABEL: @test_vrsra_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[TMP3]], <4 x i16> splat (i16 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
-// CHECK: ret <4 x i16> [[VRSRA_N]]
+// CHECK-LABEL: define <4 x i16> @test_vrsra_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[B]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i16> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <4 x i16> [[VRSRA_N]]
+//
 int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) {
   return vrsra_n_s16(a, b, 1);
 }

-// CHECK-LABEL: @test_vrsra_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[TMP3]], <2 x i32> splat (i32 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
-// CHECK: ret <2 x i32> [[VRSRA_N]]
+// CHECK-LABEL: define <2 x i32> @test_vrsra_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[B]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i32> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <2 x i32> [[VRSRA_N]]
+//
 int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) {
   return vrsra_n_s32(a, b, 1);
 }

-// CHECK-LABEL: @test_vrsra_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[TMP3]], <1 x i64> splat (i64 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
-// CHECK: ret <1 x i64> [[VRSRA_N]]
+// CHECK-LABEL: define <1 x i64> @test_vrsra_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
[[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[B]], <1 x i64> splat (i64 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <1 x i64> [[A]], [[TMP0]] +// CHECK-NEXT: ret <1 x i64> [[VRSRA_N]] +// int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) { return vrsra_n_s64(a, b, 1); } -// CHECK-LABEL: @test_vrsra_n_u8( -// CHECK: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %b, <8 x i8> splat (i8 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]] -// CHECK: ret <8 x i8> [[VRSRA_N]] +// CHECK-LABEL: define <8 x i8> @test_vrsra_n_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> [[B]], <8 x i8> splat (i8 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <8 x i8> [[A]], [[TMP0]] +// CHECK-NEXT: ret <8 x i8> [[VRSRA_N]] +// uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) { return vrsra_n_u8(a, b, 1); } -// CHECK-LABEL: @test_vrsra_n_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[TMP3]], <4 x i16> splat (i16 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]] -// CHECK: ret <4 x i16> [[VRSRA_N]] +// CHECK-LABEL: define <4 x i16> @test_vrsra_n_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[B]], <4 x i16> splat (i16 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i16> [[A]], [[TMP0]] +// CHECK-NEXT: ret <4 x i16> [[VRSRA_N]] +// uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) { return vrsra_n_u16(a, b, 1); } -// CHECK-LABEL: @test_vrsra_n_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[TMP3]], <2 x i32> splat (i32 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]] -// CHECK: ret <2 x i32> [[VRSRA_N]] +// CHECK-LABEL: define <2 x i32> @test_vrsra_n_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[B]], <2 x i32> splat (i32 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i32> [[A]], [[TMP0]] +// CHECK-NEXT: ret <2 x i32> [[VRSRA_N]] +// uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) { return vrsra_n_u32(a, b, 1); } -// CHECK-LABEL: @test_vrsra_n_u64( -// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[TMP3]], <1 x i64> splat (i64 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]] -// CHECK: ret <1 x i64> [[VRSRA_N]] +// CHECK-LABEL: define <1 x i64> 
@test_vrsra_n_u64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[B]], <1 x i64> splat (i64 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <1 x i64> [[A]], [[TMP0]] +// CHECK-NEXT: ret <1 x i64> [[VRSRA_N]] +// uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) { return vrsra_n_u64(a, b, 1); } -// CHECK-LABEL: @test_vrsraq_n_s8( -// CHECK: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %b, <16 x i8> splat (i8 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]] -// CHECK: ret <16 x i8> [[VRSRA_N]] +// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> [[B]], <16 x i8> splat (i8 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <16 x i8> [[A]], [[TMP0]] +// CHECK-NEXT: ret <16 x i8> [[VRSRA_N]] +// int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) { return vrsraq_n_s8(a, b, 1); } -// CHECK-LABEL: @test_vrsraq_n_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[TMP3]], <8 x i16> splat (i16 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]] -// CHECK: ret <8 x i16> [[VRSRA_N]] +// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[B]], <8 x i16> splat (i16 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <8 x i16> [[A]], [[TMP0]] +// CHECK-NEXT: ret <8 x i16> [[VRSRA_N]] +// int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) { return vrsraq_n_s16(a, b, 1); } -// CHECK-LABEL: @test_vrsraq_n_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[TMP3]], <4 x i32> splat (i32 -1)) -// CHECK: [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] -// CHECK: ret <4 x i32> [[VRSRA_N]] +// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[B]], <4 x i32> splat (i32 -1)) +// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i32> [[A]], [[TMP0]] +// CHECK-NEXT: ret <4 x i32> [[VRSRA_N]] +// int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) { return vrsraq_n_s32(a, b, 1); } -// CHECK-LABEL: @test_vrsraq_n_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[TMP3]], <2 x i64> 
-// CHECK: [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-// CHECK: ret <2 x i64> [[VRSRA_N]]
+// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[B]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i64> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <2 x i64> [[VRSRA_N]]
+//
int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) {
  return vrsraq_n_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vrsraq_n_u8(
-// CHECK: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %b, <16 x i8> splat (i8 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
-// CHECK: ret <16 x i8> [[VRSRA_N]]
+// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> [[B]], <16 x i8> splat (i8 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <16 x i8> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <16 x i8> [[VRSRA_N]]
+//
uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) {
  return vrsraq_n_u8(a, b, 1);
}
-// CHECK-LABEL: @test_vrsraq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[TMP3]], <8 x i16> splat (i16 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
-// CHECK: ret <8 x i16> [[VRSRA_N]]
+// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[B]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <8 x i16> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <8 x i16> [[VRSRA_N]]
+//
uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) {
  return vrsraq_n_u16(a, b, 1);
}
-// CHECK-LABEL: @test_vrsraq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[TMP3]], <4 x i32> splat (i32 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
-// CHECK: ret <4 x i32> [[VRSRA_N]]
+// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[B]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <4 x i32> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <4 x i32> [[VRSRA_N]]
+//
uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) {
  return vrsraq_n_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vrsraq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[TMP3]], <2 x i64> splat (i64 -1))
-// CHECK: [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
-// CHECK: ret <2 x i64> [[VRSRA_N]]
+// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[B]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: [[VRSRA_N:%.*]] = add <2 x i64> [[A]], [[TMP0]]
+// CHECK-NEXT: ret <2 x i64> [[VRSRA_N]]
+//
uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) {
  return vrsraq_n_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vrsubhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrsubhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSUBHN_V2_I]]
+//
int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) {
  return vrsubhn_s16(a, b);
}
-// CHECK-LABEL: @test_vrsubhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrsubhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VRSUBHN_V2_I]]
+//
int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
  return vrsubhn_s32(a, b);
}
-// CHECK-LABEL: @test_vrsubhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrsubhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VRSUBHN_V2_I]]
+//
int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
  return vrsubhn_s64(a, b);
}
-// CHECK-LABEL: @test_vrsubhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
-// CHECK: ret <8 x i8> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vrsubhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VRSUBHN_V2_I]]
+//
uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) {
  return vrsubhn_u16(a, b);
}
-// CHECK-LABEL: @test_vrsubhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vrsubhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VRSUBHN_V2_I]]
+//
uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
  return vrsubhn_u32(a, b);
}
-// CHECK-LABEL: @test_vrsubhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VRSUBHN_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vrsubhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VRSUBHN_V2_I]]
+//
uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
  return vrsubhn_u64(a, b);
}
-// CHECK-LABEL: @test_vset_lane_u8(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VSET_LANE]]
+// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[A]], i64 7
+// CHECK-NEXT: ret <8 x i8> [[VSET_LANE]]
+//
uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
  return vset_lane_u8(a, b, 7);
}
-// CHECK-LABEL: @test_vset_lane_u16(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> %b, i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VSET_LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vset_lane_u16(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[A]], i64 3
+// CHECK-NEXT: ret <4 x i16> [[VSET_LANE]]
+//
uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
  return vset_lane_u16(a, b, 3);
}
-// CHECK-LABEL: @test_vset_lane_u32(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> %b, i32 %a, i32 1
-// CHECK: ret <2 x i32> [[VSET_LANE]]
+// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[A]], i64 1
+// CHECK-NEXT: ret <2 x i32> [[VSET_LANE]]
+//
uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
  return vset_lane_u32(a, b, 1);
}
-// CHECK-LABEL: @test_vset_lane_s8(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VSET_LANE]]
+// CHECK-LABEL: define <8 x i8> @test_vset_lane_s8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[A]], i64 7
+// CHECK-NEXT: ret <8 x i8> [[VSET_LANE]]
+//
int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) {
  return vset_lane_s8(a, b, 7);
}
-// CHECK-LABEL: @test_vset_lane_s16(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> %b, i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VSET_LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vset_lane_s16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[A]], i64 3
+// CHECK-NEXT: ret <4 x i16> [[VSET_LANE]]
+//
int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) {
  return vset_lane_s16(a, b, 3);
}
-// CHECK-LABEL: @test_vset_lane_s32(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i32> %b, i32 %a, i32 1
-// CHECK: ret <2 x i32> [[VSET_LANE]]
+// CHECK-LABEL: define <2 x i32> @test_vset_lane_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i32> [[B]], i32 [[A]], i64 1
+// CHECK-NEXT: ret <2 x i32> [[VSET_LANE]]
+//
int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) {
  return vset_lane_s32(a, b, 1);
}
-// CHECK-LABEL: @test_vset_lane_p8(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
-// CHECK: ret <8 x i8> [[VSET_LANE]]
+// CHECK-LABEL: define <8 x i8> @test_vset_lane_p8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[A]], i64 7
+// CHECK-NEXT: ret <8 x i8> [[VSET_LANE]]
+//
poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) {
  return vset_lane_p8(a, b, 7);
}
-// CHECK-LABEL: @test_vset_lane_p16(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> %b, i16 %a, i32 3
-// CHECK: ret <4 x i16> [[VSET_LANE]]
+// CHECK-LABEL: define <4 x i16> @test_vset_lane_p16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[B]], i16 [[A]], i64 3
+// CHECK-NEXT: ret <4 x i16> [[VSET_LANE]]
+//
poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) {
  return vset_lane_p16(a, b, 3);
}
-// CHECK-LABEL: @test_vset_lane_f32(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x float> %b, float %a, i32 1
-// CHECK: ret <2 x float> [[VSET_LANE]]
+// CHECK-LABEL: define <2 x float> @test_vset_lane_f32(
+// CHECK-SAME: float noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x float> [[B]], float [[A]], i64 1
+// CHECK-NEXT: ret <2 x float> [[VSET_LANE]]
+//
float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
  return vset_lane_f32(a, b, 1);
}
-// CHECK-LABEL: @test_vset_lane_f16(
-// CHECK: [[__REINT_246:%.*]] = alloca half, align 2
-// CHECK: [[__REINT1_246:%.*]] = alloca <4 x half>, align 8
-// CHECK: [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8
-// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
-// CHECK: store half [[TMP0]], ptr [[__REINT_246]], align 2
-// CHECK: store <4 x half> %b, ptr [[__REINT1_246]], align 8
-// CHECK: [[TMP2:%.*]] = load i16, ptr [[__REINT_246]], align 2
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[__REINT1_246]], align 8
-// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP4]], i16 [[TMP2]], i32 1
-// CHECK: store <4 x i16> [[VSET_LANE]], ptr [[__REINT2_246]], align 8
-// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[__REINT2_246]], align 8
-// CHECK: ret <4 x half> [[TMP8]]
+// CHECK-LABEL: define <4 x half> @test_vset_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <4 x i16>
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[TMP0]], i64 1
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VSET_LANE]] to <4 x half>
+// CHECK-NEXT: ret <4 x half> [[TMP2]]
+//
float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) {
  return vset_lane_f16(*a, b, 1);
}
-// CHECK-LABEL: @test_vsetq_lane_u8(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VSET_LANE]]
+// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_u8(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[A]], i64 15
+// CHECK-NEXT: ret <16 x i8> [[VSET_LANE]]
+//
uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
  return vsetq_lane_u8(a, b, 15);
}
-// CHECK-LABEL: @test_vsetq_lane_u16(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> %b, i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VSET_LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_u16(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[A]], i64 7
+// CHECK-NEXT: ret <8 x i16> [[VSET_LANE]]
+//
uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
  return vsetq_lane_u16(a, b, 7);
}
-// CHECK-LABEL: @test_vsetq_lane_u32(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> %b, i32 %a, i32 3
-// CHECK: ret <4 x i32> [[VSET_LANE]]
+// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_u32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[A]], i64 3
+// CHECK-NEXT: ret <4 x i32> [[VSET_LANE]]
+//
uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
  return vsetq_lane_u32(a, b, 3);
}
-// CHECK-LABEL: @test_vsetq_lane_s8(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VSET_LANE]]
+// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_s8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[A]], i64 15
+// CHECK-NEXT: ret <16 x i8> [[VSET_LANE]]
+//
int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) {
  return vsetq_lane_s8(a, b, 15);
}
-// CHECK-LABEL: @test_vsetq_lane_s16(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> %b, i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VSET_LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_s16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[A]], i64 7
+// CHECK-NEXT: ret <8 x i16> [[VSET_LANE]]
+//
int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) {
  return vsetq_lane_s16(a, b, 7);
}
-// CHECK-LABEL: @test_vsetq_lane_s32(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x i32> %b, i32 %a, i32 3
-// CHECK: ret <4 x i32> [[VSET_LANE]]
+// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_s32(
+// CHECK-SAME: i32 noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i32> [[B]], i32 [[A]], i64 3
+// CHECK-NEXT: ret <4 x i32> [[VSET_LANE]]
+//
int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) {
  return vsetq_lane_s32(a, b, 3);
}
-// CHECK-LABEL: @test_vsetq_lane_p8(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
-// CHECK: ret <16 x i8> [[VSET_LANE]]
+// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_p8(
+// CHECK-SAME: i8 noundef signext [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[A]], i64 15
+// CHECK-NEXT: ret <16 x i8> [[VSET_LANE]]
+//
poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) {
  return vsetq_lane_p8(a, b, 15);
}
-// CHECK-LABEL: @test_vsetq_lane_p16(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> %b, i16 %a, i32 7
-// CHECK: ret <8 x i16> [[VSET_LANE]]
+// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_p16(
+// CHECK-SAME: i16 noundef signext [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[B]], i16 [[A]], i64 7
+// CHECK-NEXT: ret <8 x i16> [[VSET_LANE]]
+//
poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) {
  return vsetq_lane_p16(a, b, 7);
}
-// CHECK-LABEL: @test_vsetq_lane_f32(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <4 x float> %b, float %a, i32 3
-// CHECK: ret <4 x float> [[VSET_LANE]]
+// CHECK-LABEL: define <4 x float> @test_vsetq_lane_f32(
+// CHECK-SAME: float noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x float> [[B]], float [[A]], i64 3
+// CHECK-NEXT: ret <4 x float> [[VSET_LANE]]
+//
float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
  return vsetq_lane_f32(a, b, 3);
}
-// CHECK-LABEL: @test_vsetq_lane_f16(
-// CHECK: [[__REINT_248:%.*]] = alloca half, align 2
-// CHECK: [[__REINT1_248:%.*]] = alloca <8 x half>, align 16
-// CHECK: [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16
-// CHECK: [[TMP0:%.*]] = load half, ptr %a, align 2
-// CHECK: store half [[TMP0]], ptr [[__REINT_248]], align 2
-// CHECK: store <8 x half> %b, ptr [[__REINT1_248]], align 16
-// CHECK: [[TMP2:%.*]] = load i16, ptr [[__REINT_248]], align 2
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[__REINT1_248]], align 16
-// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP4]], i16 [[TMP2]], i32 3
-// CHECK: store <8 x i16> [[VSET_LANE]], ptr [[__REINT2_248]], align 16
-// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[__REINT2_248]], align 16
-// CHECK: ret <8 x half> [[TMP8]]
+// CHECK-LABEL: define <8 x half> @test_vsetq_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <8 x i16>
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[TMP0]], i64 3
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[VSET_LANE]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
+//
float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {
  return vsetq_lane_f16(*a, b, 3);
}
-// CHECK-LABEL: @test_vset_lane_s64(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> %b, i64 %a, i32 0
-// CHECK: ret <1 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define <1 x i64> @test_vset_lane_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[VSET_LANE]]
+//
int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) {
  return vset_lane_s64(a, b, 0);
}
-// CHECK-LABEL: @test_vset_lane_u64(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <1 x i64> %b, i64 %a, i32 0
-// CHECK: ret <1 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define <1 x i64> @test_vset_lane_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> poison, i64 [[A]], i64 0
+// CHECK-NEXT: ret <1 x i64> [[VSET_LANE]]
+//
uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) {
  return vset_lane_u64(a, b, 0);
}
-// CHECK-LABEL: @test_vsetq_lane_s64(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %b, i64 %a, i32 1
-// CHECK: ret <2 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_s64(
+// CHECK-SAME: i64 noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[B]], i64 [[A]], i64 1
+// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]]
+//
int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
  return vsetq_lane_s64(a, b, 1);
}
-// CHECK-LABEL: @test_vsetq_lane_u64(
-// CHECK: [[VSET_LANE:%.*]] = insertelement <2 x i64> %b, i64 %a, i32 1
-// CHECK: ret <2 x i64> [[VSET_LANE]]
+// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_u64(
+// CHECK-SAME: i64 noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[B]], i64 [[A]], i64 1
+// CHECK-NEXT: ret <2 x i64> [[VSET_LANE]]
+//
uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) {
  return vsetq_lane_u64(a, b, 1);
}
-// CHECK-LABEL: @test_vshl_s8(
-// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vshl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VSHL_V_I]]
+//
int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
  return vshl_s8(a, b);
}
-// CHECK-LABEL: @test_vshl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vshl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VSHL_V2_I]]
+//
int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
  return vshl_s16(a, b);
}
-// CHECK-LABEL: @test_vshl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vshl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VSHL_V2_I]]
+//
int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
  return vshl_s32(a, b);
}
-// CHECK-LABEL: @test_vshl_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vshl_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VSHL_V2_I]]
+//
int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
  return vshl_s64(a, b);
}
-// CHECK-LABEL: @test_vshl_u8(
-// CHECK: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VSHL_V_I]]
+// CHECK-LABEL: define <8 x i8> @test_vshl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VSHL_V_I]]
+//
uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
  return vshl_u8(a, b);
}
-// CHECK-LABEL: @test_vshl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %a, <4 x i16> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <4 x i16> [[VSHL_V2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vshl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> [[A]], <4 x i16> [[B]])
+// CHECK-NEXT: ret <4 x i16> [[VSHL_V2_I]]
+//
uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
  return vshl_u16(a, b);
}
-// CHECK-LABEL: @test_vshl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %a, <2 x i32> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <2 x i32> [[VSHL_V2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vshl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> [[A]], <2 x i32> [[B]])
+// CHECK-NEXT: ret <2 x i32> [[VSHL_V2_I]]
+//
uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
  return vshl_u32(a, b);
}
-// CHECK-LABEL: @test_vshl_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %a, <1 x i64> %b)
-// CHECK: [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
-// CHECK: ret <1 x i64> [[VSHL_V2_I]]
+// CHECK-LABEL: define <1 x i64> @test_vshl_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> [[A]], <1 x i64> [[B]])
+// CHECK-NEXT: ret <1 x i64> [[VSHL_V2_I]]
+//
uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
  return vshl_u64(a, b);
}
-// CHECK-LABEL: @test_vshlq_s8(
-// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vshlq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VSHLQ_V_I]]
+//
int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
  return vshlq_s8(a, b);
}
-// CHECK-LABEL: @test_vshlq_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vshlq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VSHLQ_V2_I]]
+//
int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
  return vshlq_s16(a, b);
}
-// CHECK-LABEL: @test_vshlq_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vshlq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VSHLQ_V2_I]]
+//
int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
  return vshlq_s32(a, b);
}
-// CHECK-LABEL: @test_vshlq_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vshlq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VSHLQ_V2_I]]
+//
int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
  return vshlq_s64(a, b);
}
-// CHECK-LABEL: @test_vshlq_u8(
-// CHECK: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %a, <16 x i8> %b)
-// CHECK: ret <16 x i8> [[VSHLQ_V_I]]
+// CHECK-LABEL: define <16 x i8> @test_vshlq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
+// CHECK-NEXT: ret <16 x i8> [[VSHLQ_V_I]]
+//
uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
  return vshlq_u8(a, b);
}
-// CHECK-LABEL: @test_vshlq_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %a, <8 x i16> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <8 x i16> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <8 x i16> @test_vshlq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> [[A]], <8 x i16> [[B]])
+// CHECK-NEXT: ret <8 x i16> [[VSHLQ_V2_I]]
+//
uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
  return vshlq_u16(a, b);
}
-// CHECK-LABEL: @test_vshlq_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %a, <4 x i32> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <4 x i32> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <4 x i32> @test_vshlq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> [[A]], <4 x i32> [[B]])
+// CHECK-NEXT: ret <4 x i32> [[VSHLQ_V2_I]]
+//
uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
  return vshlq_u32(a, b);
}
-// CHECK-LABEL: @test_vshlq_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %a, <2 x i64> %b)
-// CHECK: [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
-// CHECK: ret <2 x i64> [[VSHLQ_V2_I]]
+// CHECK-LABEL: define <2 x i64> @test_vshlq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> [[A]], <2 x i64> [[B]])
+// CHECK-NEXT: ret <2 x i64> [[VSHLQ_V2_I]]
+//
uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
  return vshlq_u64(a, b);
}
-// CHECK-LABEL: @test_vshll_n_s8(
-// CHECK: [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <8 x i16> [[TMP0]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
int16x8_t test_vshll_n_s8(int8x8_t a) {
  return vshll_n_s8(a, 1);
}
-// CHECK-LABEL: @test_vshll_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <4 x i32> [[TMP0]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
int32x4_t test_vshll_n_s16(int16x4_t a) {
  return vshll_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vshll_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nsw <2 x i64> [[TMP0]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
int64x2_t test_vshll_n_s32(int32x2_t a) {
  return vshll_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vshll_n_u8(
-// CHECK: [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHLL_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <8 x i16> [[TMP0]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHLL_N]]
+//
uint16x8_t test_vshll_n_u8(uint8x8_t a) {
  return vshll_n_u8(a, 1);
}
-// CHECK-LABEL: @test_vshll_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
-// CHECK: [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHLL_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <4 x i32> [[TMP0]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHLL_N]]
+//
uint32x4_t test_vshll_n_u16(uint16x4_t a) {
  return vshll_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vshll_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
-// CHECK: [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHLL_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VSHLL_N:%.*]] = shl nuw nsw <2 x i64> [[TMP0]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHLL_N]]
+//
uint64x2_t test_vshll_n_u32(uint32x2_t a) {
  return vshll_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_s8(
-// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, splat (i8 1)
-// CHECK: ret <8 x i8> [[VSHL_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshl_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <8 x i8> [[VSHL_N]]
+//
int8x8_t test_vshl_n_s8(int8x8_t a) {
  return vshl_n_s8(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <4 x i16> [[VSHL_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshl_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i16> [[A]], splat (i16 1)
+// CHECK-NEXT: ret <4 x i16> [[VSHL_N]]
+//
int16x4_t test_vshl_n_s16(int16x4_t a) {
  return vshl_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <2 x i32> [[VSHL_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshl_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i32> [[A]], splat (i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VSHL_N]]
+//
int32x2_t test_vshl_n_s32(int32x2_t a) {
  return vshl_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHL_N]]
+// CHECK-LABEL: define <1 x i64> @test_vshl_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <1 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHL_N]]
+//
int64x1_t test_vshl_n_s64(int64x1_t a) {
  return vshl_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_u8(
-// CHECK: [[VSHL_N:%.*]] = shl <8 x i8> %a, splat (i8 1)
-// CHECK: ret <8 x i8> [[VSHL_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshl_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <8 x i8> [[VSHL_N]]
+//
uint8x8_t test_vshl_n_u8(uint8x8_t a) {
  return vshl_n_u8(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <4 x i16> [[VSHL_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshl_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i16> [[A]], splat (i16 1)
+// CHECK-NEXT: ret <4 x i16> [[VSHL_N]]
+//
uint16x4_t test_vshl_n_u16(uint16x4_t a) {
  return vshl_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <2 x i32> [[VSHL_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshl_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i32> [[A]], splat (i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VSHL_N]]
+//
uint32x2_t test_vshl_n_u32(uint32x2_t a) {
  return vshl_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vshl_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHL_N]]
+// CHECK-LABEL: define <1 x i64> @test_vshl_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <1 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHL_N]]
+//
uint64x1_t test_vshl_n_u64(uint64x1_t a) {
  return vshl_n_u64(a, 1);
}
-// CHECK-LABEL: @test_vshlq_n_s8(
-// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, splat (i8 1)
-// CHECK: ret <16 x i8> [[VSHL_N]]
+// CHECK-LABEL: define <16 x i8> @test_vshlq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <16 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <16 x i8> [[VSHL_N]]
+//
int8x16_t test_vshlq_n_s8(int8x16_t a) {
  return vshlq_n_s8(a, 1);
}
-// CHECK-LABEL: @test_vshlq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHL_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshlq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i16> [[A]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHL_N]]
+//
int16x8_t test_vshlq_n_s16(int16x8_t a) {
  return vshlq_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vshlq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHL_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshlq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i32> [[A]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHL_N]]
+//
int32x4_t test_vshlq_n_s32(int32x4_t a) {
  return vshlq_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vshlq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHL_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshlq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHL_N]]
+//
int64x2_t test_vshlq_n_s64(int64x2_t a) {
  return vshlq_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vshlq_n_u8(
-// CHECK: [[VSHL_N:%.*]] = shl <16 x i8> %a, splat (i8 1)
-// CHECK: ret <16 x i8> [[VSHL_N]]
+// CHECK-LABEL: define <16 x i8> @test_vshlq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <16 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <16 x i8> [[VSHL_N]]
+//
uint8x16_t test_vshlq_n_u8(uint8x16_t a) {
  return vshlq_n_u8(a, 1);
}
-// CHECK-LABEL: @test_vshlq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHL_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshlq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <8 x i16> [[A]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHL_N]]
+//
uint16x8_t test_vshlq_n_u16(uint16x8_t a) {
  return vshlq_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vshlq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHL_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshlq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <4 x i32> [[A]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHL_N]]
+//
uint32x4_t test_vshlq_n_u32(uint32x4_t a) {
  return vshlq_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vshlq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHL_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshlq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHL_N:%.*]] = shl <2 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHL_N]]
+//
uint64x2_t test_vshlq_n_u64(uint64x2_t a) {
  return vshlq_n_u64(a, 1);
}
-// CHECK-LABEL: @test_vshrn_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSHRN_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshrn_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <8 x i16> [[A]], splat (i16 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSHRN_N]]
+//
int8x8_t test_vshrn_n_s16(int16x8_t a) {
  return vshrn_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vshrn_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSHRN_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshrn_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <4 x i32> [[A]], splat (i32 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSHRN_N]]
+//
int16x4_t test_vshrn_n_s32(int32x4_t a) {
  return vshrn_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vshrn_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSHRN_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshrn_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSHRN_N]]
+//
int32x2_t test_vshrn_n_s64(int64x2_t a) {
  return vshrn_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vshrn_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSHRN_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshrn_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <8 x i16> [[A]], splat (i16 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP0]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSHRN_N]]
+//
uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
  return vshrn_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vshrn_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSHRN_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshrn_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <4 x i32> [[A]], splat (i32 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP0]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSHRN_N]]
+//
uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
  return vshrn_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vshrn_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSHRN_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshrn_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP0]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSHRN_N]]
+//
uint32x2_t test_vshrn_n_u64(uint64x2_t a) {
  return vshrn_n_u64(a, 1);
}
-// CHECK-LABEL: @test_vshr_n_s8(
-// CHECK: [[VSHR_N:%.*]] = ashr <8 x i8> %a, splat (i8 1)
-// CHECK: ret <8 x i8> [[VSHR_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshr_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <8 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <8 x i8> [[VSHR_N]]
+//
int8x8_t test_vshr_n_s8(int8x8_t a) {
  return vshr_n_s8(a, 1);
}
-// CHECK-LABEL: @test_vshr_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <4 x i16> [[VSHR_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshr_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <4 x i16> [[A]], splat (i16 1)
+// CHECK-NEXT: ret <4 x i16> [[VSHR_N]]
+//
int16x4_t test_vshr_n_s16(int16x4_t a) {
  return vshr_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vshr_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <2 x i32> [[VSHR_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshr_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <2 x i32> [[A]], splat (i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VSHR_N]]
+//
int32x2_t test_vshr_n_s32(int32x2_t a) {
  return vshr_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vshr_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHR_N]]
+// CHECK-LABEL: define <1 x i64> @test_vshr_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <1 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHR_N]]
+//
int64x1_t test_vshr_n_s64(int64x1_t a) {
  return vshr_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vshr_n_u8(
-// CHECK: [[VSHR_N:%.*]] = lshr <8 x i8> %a, splat (i8 1)
-// CHECK: ret <8 x i8> [[VSHR_N]]
+// CHECK-LABEL: define <8 x i8> @test_vshr_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <8 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <8 x i8> [[VSHR_N]]
+//
uint8x8_t test_vshr_n_u8(uint8x8_t a) {
  return vshr_n_u8(a, 1);
}
-// CHECK-LABEL: @test_vshr_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <4 x i16> [[VSHR_N]]
+// CHECK-LABEL: define <4 x i16> @test_vshr_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <4 x i16> [[A]], splat (i16 1)
+// CHECK-NEXT: ret <4 x i16> [[VSHR_N]]
+//
uint16x4_t test_vshr_n_u16(uint16x4_t a) {
  return vshr_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vshr_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <2 x i32> [[VSHR_N]]
+// CHECK-LABEL: define <2 x i32> @test_vshr_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <2 x i32> [[A]], splat (i32 1)
+// CHECK-NEXT: ret <2 x i32> [[VSHR_N]]
+//
uint32x2_t test_vshr_n_u32(uint32x2_t a) {
  return vshr_n_u32(a, 1);
}
-// CHECK-LABEL: @test_vshr_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <1 x i64> [[VSHR_N]]
+// CHECK-LABEL: define <1 x i64> @test_vshr_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <1 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: ret <1 x i64> [[VSHR_N]]
+//
uint64x1_t test_vshr_n_u64(uint64x1_t a) {
  return vshr_n_u64(a, 1);
}
-// CHECK-LABEL: @test_vshrq_n_s8(
-// CHECK: [[VSHR_N:%.*]] = ashr <16 x i8> %a, splat (i8 1)
-// CHECK: ret <16 x i8> [[VSHR_N]]
+// CHECK-LABEL: define <16 x i8> @test_vshrq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <16 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <16 x i8> [[VSHR_N]]
+//
int8x16_t test_vshrq_n_s8(int8x16_t a) {
  return vshrq_n_s8(a, 1);
}
-// CHECK-LABEL: @test_vshrq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHR_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshrq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <8 x i16> [[A]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHR_N]]
+//
int16x8_t test_vshrq_n_s16(int16x8_t a) {
  return vshrq_n_s16(a, 1);
}
-// CHECK-LABEL: @test_vshrq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHR_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshrq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <4 x i32> [[A]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHR_N]]
+//
int32x4_t test_vshrq_n_s32(int32x4_t a) {
  return vshrq_n_s32(a, 1);
}
-// CHECK-LABEL: @test_vshrq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHR_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshrq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = ashr <2 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHR_N]]
+//
int64x2_t test_vshrq_n_s64(int64x2_t a) {
  return vshrq_n_s64(a, 1);
}
-// CHECK-LABEL: @test_vshrq_n_u8(
-// CHECK: [[VSHR_N:%.*]] = lshr <16 x i8> %a, splat (i8 1)
-// CHECK: ret <16 x i8> [[VSHR_N]]
+// CHECK-LABEL: define <16 x i8> @test_vshrq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <16 x i8> [[A]], splat (i8 1)
+// CHECK-NEXT: ret <16 x i8> [[VSHR_N]]
+//
uint8x16_t test_vshrq_n_u8(uint8x16_t a) {
  return vshrq_n_u8(a, 1);
}
-// CHECK-LABEL: @test_vshrq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], splat (i16 1)
-// CHECK: ret <8 x i16> [[VSHR_N]]
+// CHECK-LABEL: define <8 x i16> @test_vshrq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <8 x i16> [[A]], splat (i16 1)
+// CHECK-NEXT: ret <8 x i16> [[VSHR_N]]
+//
uint16x8_t test_vshrq_n_u16(uint16x8_t a) {
  return vshrq_n_u16(a, 1);
}
-// CHECK-LABEL: @test_vshrq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], splat (i32 1)
-// CHECK: ret <4 x i32> [[VSHR_N]]
+// CHECK-LABEL: define <4 x i32> @test_vshrq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <4 x i32> [[A]], splat (i32 1)
+// CHECK-NEXT: ret <4 x i32> [[VSHR_N]]
+//
uint32x4_t test_vshrq_n_u32(uint32x4_t a) { return vshrq_n_u32(a, 1); }
-// CHECK-LABEL: @test_vshrq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], splat (i64 1)
-// CHECK: ret <2 x i64> [[VSHR_N]]
+// CHECK-LABEL: define <2 x i64> @test_vshrq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSHR_N:%.*]] = lshr <2 x i64> [[A]], splat (i64 1)
+// CHECK-NEXT: ret <2 x i64> [[VSHR_N]]
+//
uint64x2_t test_vshrq_n_u64(uint64x2_t a) { return vshrq_n_u64(a, 1); }
-// CHECK-LABEL: @test_vsli_n_s8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 1))
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <8 x i8> @test_vsli_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 1))
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) { return vsli_n_s8(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i16> @test_vsli_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) { return vsli_n_s16(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 1))
-// CHECK: ret <2 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i32> @test_vsli_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> splat (i32 1))
+// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
+//
int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) { return vsli_n_s32(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <1 x i64> @test_vsli_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) { return vsli_n_s64(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_u8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 1))
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <8 x i8> @test_vsli_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 1))
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) { return vsli_n_u8(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i16> @test_vsli_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) { return vsli_n_u16(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 1))
-// CHECK: ret <2 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i32> @test_vsli_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> splat (i32 1))
+// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
+//
uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) { return vsli_n_u32(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 1))
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <1 x i64> @test_vsli_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> splat (i64 1))
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) { return vsli_n_u64(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_p8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 1))
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <8 x i8> @test_vsli_n_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 1))
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) { return vsli_n_p8(a, b, 1); }
-// CHECK-LABEL: @test_vsli_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 1))
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i16> @test_vsli_n_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> splat (i16 1))
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) { return vsli_n_p16(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_s8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 1))
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <16 x i8> @test_vsliq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 1))
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) { return vsliq_n_s8(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <8 x i16> @test_vsliq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) { return vsliq_n_s16(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 1))
-// CHECK: ret <4 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i32> @test_vsliq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
+//
int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) { return vsliq_n_s32(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 1))
-// CHECK: ret <2 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i64> @test_vsliq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
+//
int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) { return vsliq_n_s64(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_u8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 1))
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <16 x i8> @test_vsliq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 1))
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) { return vsliq_n_u8(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <8 x i16> @test_vsliq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) { return vsliq_n_u16(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 1))
-// CHECK: ret <4 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i32> @test_vsliq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> splat (i32 1))
+// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
+//
uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) { return vsliq_n_u32(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 1))
-// CHECK: ret <2 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i64> @test_vsliq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> splat (i64 1))
+// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
+//
uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) { return vsliq_n_u64(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_p8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 1))
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <16 x i8> @test_vsliq_n_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 1))
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) { return vsliq_n_p8(a, b, 1); }
-// CHECK-LABEL: @test_vsliq_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 1))
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <8 x i16> @test_vsliq_n_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> splat (i16 1))
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) { return vsliq_n_p16(a, b, 1); }
-// CHECK-LABEL: @test_vsra_n_s8(
-// CHECK: [[VSRA_N:%.*]] = ashr <8 x i8> %b, splat (i8 1)
-// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vsra_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <8 x i8> [[B]], splat (i8 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) { return vsra_n_s8(a, b, 1); }
-// CHECK-LABEL: @test_vsra_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], splat (i16 1)
-// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <4 x i16> [[TMP4]]
+// CHECK-LABEL: define <4 x i16> @test_vsra_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <4 x i16> [[B]], splat (i16 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) { return vsra_n_s16(a, b, 1); }
-// CHECK-LABEL: @test_vsra_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], splat (i32 1)
-// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <2 x i32> [[TMP4]]
+// CHECK-LABEL: define <2 x i32> @test_vsra_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <2 x i32> [[B]], splat (i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) { return vsra_n_s32(a, b, 1); }
-// CHECK-LABEL: @test_vsra_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], splat (i64 1)
-// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <1 x i64> [[TMP4]]
+// CHECK-LABEL: define <1 x i64> @test_vsra_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <1 x i64> [[B]], splat (i64 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <1 x i64> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) { return vsra_n_s64(a, b, 1); }
-// CHECK-LABEL: @test_vsra_n_u8(
-// CHECK: [[VSRA_N:%.*]] = lshr <8 x i8> %b, splat (i8 1)
-// CHECK: [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
-// CHECK: ret <8 x i8> [[TMP0]]
+// CHECK-LABEL: define <8 x i8> @test_vsra_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <8 x i8> [[B]], splat (i8 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i8> [[TMP0]]
+//
uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) { return vsra_n_u8(a, b, 1); }
-// CHECK-LABEL: @test_vsra_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], splat (i16 1)
-// CHECK: [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <4 x i16> [[TMP4]]
+// CHECK-LABEL: define <4 x i16> @test_vsra_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <4 x i16> [[B]], splat (i16 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i16> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i16> [[TMP0]]
+//
uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) { return vsra_n_u16(a, b, 1); }
-// CHECK-LABEL: @test_vsra_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], splat (i32 1)
-// CHECK: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <2 x i32> [[TMP4]]
+// CHECK-LABEL: define <2 x i32> @test_vsra_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <2 x i32> [[B]], splat (i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i32> [[TMP0]]
+//
uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) { return vsra_n_u32(a, b, 1); }
-// CHECK-LABEL: @test_vsra_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], splat (i64 1)
-// CHECK: [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <1 x i64> [[TMP4]]
+// CHECK-LABEL: define <1 x i64> @test_vsra_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <1 x i64> [[B]], splat (i64 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <1 x i64> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <1 x i64> [[TMP0]]
+//
uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) { return vsra_n_u64(a, b, 1); }
-// CHECK-LABEL: @test_vsraq_n_s8(
-// CHECK: [[VSRA_N:%.*]] = ashr <16 x i8> %b, splat (i8 1)
-// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vsraq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <16 x i8> [[B]], splat (i8 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) { return vsraq_n_s8(a, b, 1); }
-// CHECK-LABEL: @test_vsraq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], splat (i16 1)
-// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <8 x i16> [[TMP4]]
+// CHECK-LABEL: define <8 x i16> @test_vsraq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <8 x i16> [[B]], splat (i16 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) { return vsraq_n_s16(a, b, 1); }
-// CHECK-LABEL: @test_vsraq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], splat (i32 1)
-// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <4 x i32> [[TMP4]]
+// CHECK-LABEL: define <4 x i32> @test_vsraq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <4 x i32> [[B]], splat (i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) { return vsraq_n_s32(a, b, 1); }
-// CHECK-LABEL: @test_vsraq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], splat (i64 1)
-// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <2 x i64> [[TMP4]]
+// CHECK-LABEL: define <2 x i64> @test_vsraq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = ashr <2 x i64> [[B]], splat (i64 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) { return vsraq_n_s64(a, b, 1); }
-// CHECK-LABEL: @test_vsraq_n_u8(
-// CHECK: [[VSRA_N:%.*]] = lshr <16 x i8> %b, splat (i8 1)
-// CHECK: [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
-// CHECK: ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define <16 x i8> @test_vsraq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <16 x i8> [[B]], splat (i8 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <16 x i8> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <16 x i8> [[TMP0]]
+//
uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) { return vsraq_n_u8(a, b, 1); }
-// CHECK-LABEL: @test_vsraq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], splat (i16 1)
-// CHECK: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <8 x i16> [[TMP4]]
+// CHECK-LABEL: define <8 x i16> @test_vsraq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <8 x i16> [[B]], splat (i16 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <8 x i16> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <8 x i16> [[TMP0]]
+//
uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) { return vsraq_n_u16(a, b, 1); }
-// CHECK-LABEL: @test_vsraq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], splat (i32 1)
-// CHECK: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <4 x i32> [[TMP4]]
+// CHECK-LABEL: define <4 x i32> @test_vsraq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <4 x i32> [[B]], splat (i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <4 x i32> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <4 x i32> [[TMP0]]
+//
uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) { return vsraq_n_u32(a, b, 1); }
-// CHECK-LABEL: @test_vsraq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], splat (i64 1)
-// CHECK: [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
-// CHECK: ret <2 x i64> [[TMP4]]
+// CHECK-LABEL: define <2 x i64> @test_vsraq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSRA_N:%.*]] = lshr <2 x i64> [[B]], splat (i64 1)
+// CHECK-NEXT: [[TMP0:%.*]] = add <2 x i64> [[A]], [[VSRA_N]]
+// CHECK-NEXT: ret <2 x i64> [[TMP0]]
+//
uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) { return vsraq_n_u64(a, b, 1); }
-// CHECK-LABEL: @test_vsri_n_s8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 -1))
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <8 x i8> @test_vsri_n_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) { return vsri_n_s8(a, b, 1); }
-// CHECK-LABEL: @test_vsri_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1))
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i16> @test_vsri_n_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) { return vsri_n_s16(a, b, 1); }
-// CHECK-LABEL: @test_vsri_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 -1))
-// CHECK: ret <2 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i32> @test_vsri_n_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
+//
int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) { return vsri_n_s32(a, b, 1); }
-// CHECK-LABEL: @test_vsri_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 -1))
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <1 x i64> @test_vsri_n_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) { return vsri_n_s64(a, b, 1); }
-// CHECK-LABEL: @test_vsri_n_u8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 -1))
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <8 x i8> @test_vsri_n_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) { return vsri_n_u8(a, b, 1); }
-// CHECK-LABEL: @test_vsri_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1))
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i16> @test_vsri_n_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) { return vsri_n_u16(a, b, 1); }
-// CHECK-LABEL: @test_vsri_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> splat (i32 -1))
-// CHECK: ret <2 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i32> @test_vsri_n_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <2 x i32> [[VSLI_N2]]
+//
uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) { return vsri_n_u32(a, b, 1); }
-// CHECK-LABEL: @test_vsri_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> splat (i64 -1))
-// CHECK: ret <1 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <1 x i64> @test_vsri_n_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[A]], <1 x i64> [[B]], <1 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <1 x i64> [[VSLI_N2]]
+//
uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) { return vsri_n_u64(a, b, 1); }
-// CHECK-LABEL: @test_vsri_n_p8(
-// CHECK: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> splat (i8 -1))
-// CHECK: ret <8 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <8 x i8> @test_vsri_n_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <8 x i8> [[VSLI_N]]
+//
poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) { return vsri_n_p8(a, b, 1); }
-// CHECK-LABEL: @test_vsri_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> splat (i16 -1))
-// CHECK: ret <4 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i16> @test_vsri_n_p16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[A]], <4 x i16> [[B]], <4 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <4 x i16> [[VSLI_N2]]
+//
poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) { return vsri_n_p16(a, b, 1); }
-// CHECK-LABEL: @test_vsriq_n_s8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 -1))
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <16 x i8> @test_vsriq_n_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) { return vsriq_n_s8(a, b, 1); }
-// CHECK-LABEL: @test_vsriq_n_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <8 x i16> @test_vsriq_n_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) { return vsriq_n_s16(a, b, 1); }
-// CHECK-LABEL: @test_vsriq_n_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i32> @test_vsriq_n_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
+//
int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) { return vsriq_n_s32(a, b, 1); }
-// CHECK-LABEL: @test_vsriq_n_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i64> @test_vsriq_n_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
+//
int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) { return vsriq_n_s64(a, b, 1); }
-// CHECK-LABEL: @test_vsriq_n_u8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 -1))
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <16 x i8> @test_vsriq_n_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) { return vsriq_n_u8(a, b, 1); }
-// CHECK-LABEL: @test_vsriq_n_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <8 x i16> @test_vsriq_n_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) { return vsriq_n_u16(a, b, 1); }
-// CHECK-LABEL: @test_vsriq_n_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> splat (i32 -1))
-// CHECK: ret <4 x i32> [[VSLI_N2]]
+// CHECK-LABEL: define <4 x i32> @test_vsriq_n_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> splat (i32 -1))
+// CHECK-NEXT: ret <4 x i32> [[VSLI_N2]]
+//
uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) { return vsriq_n_u32(a, b, 1); }
-// CHECK-LABEL: @test_vsriq_n_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> splat (i64 -1))
-// CHECK: ret <2 x i64> [[VSLI_N2]]
+// CHECK-LABEL: define <2 x i64> @test_vsriq_n_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[A]], <2 x i64> [[B]], <2 x i64> splat (i64 -1))
+// CHECK-NEXT: ret <2 x i64> [[VSLI_N2]]
+//
uint64x2_t test_vsriq_n_u64(uint64x2_t a, uint64x2_t b) { return vsriq_n_u64(a, b, 1); }
-// CHECK-LABEL: @test_vsriq_n_p8(
-// CHECK: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 -1))
-// CHECK: ret <16 x i8> [[VSLI_N]]
+// CHECK-LABEL: define <16 x i8> @test_vsriq_n_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> splat (i8 -1))
+// CHECK-NEXT: ret <16 x i8> [[VSLI_N]]
+//
poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) { return vsriq_n_p8(a, b, 1); }
-// CHECK-LABEL: @test_vsriq_n_p16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK: [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> splat (i16 -1))
-// CHECK: ret <8 x i16> [[VSLI_N2]]
+// CHECK-LABEL: define <8 x i16> @test_vsriq_n_p16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[A]], <8 x i16> [[B]], <8 x i16> splat (i16 -1))
+// CHECK-NEXT: ret <8 x i16> [[VSLI_N2]]
+//
poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) { return vsriq_n_p16(a, b, 1); }
-// CHECK-LABEL: @test_vst1q_u8(
-// CHECK: call void @llvm.arm.neon.vst1.p0.v16i8(ptr %a, <16 x i8> %b, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v16i8(ptr [[A]], <16 x i8> [[B]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst1q_u8(uint8_t * a, uint8x16_t b) { vst1q_u8(a, b); }
-// CHECK-LABEL: @test_vst1q_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v8i16(ptr %a, <8 x i16> [[TMP2]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i16(ptr [[A]], <8 x i16> [[B]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst1q_u16(uint16_t * a, uint16x8_t b) { vst1q_u16(a, b); }
-// CHECK-LABEL: @test_vst1q_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v4i32(ptr %a, <4 x i32> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i32(ptr [[A]], <4 x i32> [[B]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1q_u32(uint32_t * a, uint32x4_t b) { vst1q_u32(a, b); }
-// CHECK-LABEL: @test_vst1q_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v2i64(ptr %a, <2 x i64> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i64(ptr [[A]], <2 x i64> [[B]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1q_u64(uint64_t * a, uint64x2_t b) { vst1q_u64(a, b); }
-// CHECK-LABEL: @test_vst1q_s8(
-// CHECK: call void @llvm.arm.neon.vst1.p0.v16i8(ptr %a, <16 x i8> %b, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v16i8(ptr [[A]], <16 x i8> [[B]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst1q_s8(int8_t * a, int8x16_t b) { vst1q_s8(a, b); }
-// CHECK-LABEL: @test_vst1q_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v8i16(ptr %a, <8 x i16> [[TMP2]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i16(ptr [[A]], <8 x i16> [[B]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst1q_s16(int16_t * a, int16x8_t b) { vst1q_s16(a, b); }
-// CHECK-LABEL: @test_vst1q_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v4i32(ptr %a, <4 x i32> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i32(ptr [[A]], <4 x i32> [[B]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1q_s32(int32_t * a, int32x4_t b) { vst1q_s32(a, b); }
-// CHECK-LABEL: @test_vst1q_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v2i64(ptr %a, <2 x i64> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i64(ptr [[A]], <2 x i64> [[B]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1q_s64(int64_t * a, int64x2_t b) { vst1q_s64(a, b); }
-// CHECK-LABEL: @test_vst1q_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v8f16(ptr %a, <8 x half> [[TMP2]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8f16(ptr [[A]], <8 x half> [[B]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst1q_f16(float16_t * a, float16x8_t b) { vst1q_f16(a, b); }
-// CHECK-LABEL: @test_vst1q_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v4f32(ptr %a, <4 x float> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4f32(ptr [[A]], <4 x float> [[B]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1q_f32(float32_t * a, float32x4_t b) { vst1q_f32(a, b); }
-// CHECK-LABEL: @test_vst1q_p8(
-// CHECK: call void @llvm.arm.neon.vst1.p0.v16i8(ptr %a, <16 x i8> %b, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v16i8(ptr [[A]], <16 x i8> [[B]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst1q_p8(poly8_t * a, poly8x16_t b) { vst1q_p8(a, b); }
-// CHECK-LABEL: @test_vst1q_p16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v8i16(ptr %a, <8 x i16> [[TMP2]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i16(ptr [[A]], <8 x i16> [[B]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst1q_p16(poly16_t * a, poly16x8_t b) { vst1q_p16(a, b); }
-// CHECK-LABEL: @test_vst1_u8(
-// CHECK: call void @llvm.arm.neon.vst1.p0.v8i8(ptr %a, <8 x i8> %b, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i8(ptr [[A]], <8 x i8> [[B]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst1_u8(uint8_t * a, uint8x8_t b) { vst1_u8(a, b); }
-// CHECK-LABEL: @test_vst1_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v4i16(ptr %a, <4 x i16> [[TMP2]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i16(ptr [[A]], <4 x i16> [[B]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst1_u16(uint16_t * a, uint16x4_t b) { vst1_u16(a, b); }
-// CHECK-LABEL: @test_vst1_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v2i32(ptr %a, <2 x i32> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i32(ptr [[A]], <2 x i32> [[B]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1_u32(uint32_t * a, uint32x2_t b) { vst1_u32(a, b); }
-// CHECK-LABEL: @test_vst1_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v1i64(ptr %a, <1 x i64> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[B]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1_u64(uint64_t * a, uint64x1_t b) { vst1_u64(a, b); }
-// CHECK-LABEL: @test_vst1_s8(
-// CHECK: call void @llvm.arm.neon.vst1.p0.v8i8(ptr %a, <8 x i8> %b, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i8(ptr [[A]], <8 x i8> [[B]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst1_s8(int8_t * a, int8x8_t b) { vst1_s8(a, b); }
-// CHECK-LABEL: @test_vst1_s16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v4i16(ptr %a, <4 x i16> [[TMP2]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i16(ptr [[A]], <4 x i16> [[B]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst1_s16(int16_t * a, int16x4_t b) { vst1_s16(a, b); }
-// CHECK-LABEL: @test_vst1_s32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v2i32(ptr %a, <2 x i32> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2i32(ptr [[A]], <2 x i32> [[B]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1_s32(int32_t * a, int32x2_t b) { vst1_s32(a, b); }
-// CHECK-LABEL: @test_vst1_s64(
-// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v1i64(ptr %a, <1 x i64> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[B]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1_s64(int64_t * a, int64x1_t b) { vst1_s64(a, b); }
-// CHECK-LABEL: @test_vst1_f16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v4f16(ptr %a, <4 x half> [[TMP2]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4f16(ptr [[A]], <4 x half> [[B]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst1_f16(float16_t * a, float16x4_t b) { vst1_f16(a, b); }
-// CHECK-LABEL: @test_vst1_f32(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v2f32(ptr %a, <2 x float> [[TMP2]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v2f32(ptr [[A]], <2 x float> [[B]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1_f32(float32_t * a, float32x2_t b) { vst1_f32(a, b); }
-// CHECK-LABEL: @test_vst1_p8(
-// CHECK: call void @llvm.arm.neon.vst1.p0.v8i8(ptr %a, <8 x i8> %b, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v8i8(ptr [[A]], <8 x i8> [[B]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst1_p8(poly8_t * a, poly8x8_t b) { vst1_p8(a, b); }
-// CHECK-LABEL: @test_vst1_p16(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v4i16(ptr %a, <4 x i16> [[TMP2]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v4i16(ptr [[A]], <4 x i16> [[B]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst1_p16(poly16_t * a, poly16x4_t b) { vst1_p16(a, b); }
-// CHECK-LABEL: @test_vst1q_lane_u8(
-// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
-// CHECK: store i8 [[TMP0]], ptr %a, align 1
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i64 15
+// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_u8(uint8_t * a, uint8x16_t b) { vst1q_lane_u8(a, b, 15); }
-// CHECK-LABEL: @test_vst1q_lane_u16(
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK: store i16 [[TMP3]], ptr %a, align 2
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[B]], i64 7
+// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_u16(uint16_t * a, uint16x8_t b) { vst1q_lane_u16(a, b, 7); }
-// CHECK-LABEL: @test_vst1q_lane_u32(
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-// CHECK: store i32 [[TMP3]], ptr %a, align 4
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[B]], i64 3
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_u32(uint32_t * a, uint32x4_t b) { vst1q_lane_u32(a, b, 3); }
-// CHECK-LABEL: @test_vst1q_lane_u64(
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
-// CHECK: call void @llvm.arm.neon.vst1.p0.v1i64(ptr %a, <1 x i64> [[TMP3]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_u64(
+// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x i64> [[B]], <2 x i64> poison, <1 x i32> <i32 1>
+// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[TMP0]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst1q_lane_u64(uint64_t * a, uint64x2_t b) { vst1q_lane_u64(a, b, 1); }
-// CHECK-LABEL: @test_vst1q_lane_s8(
-// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
-// CHECK: store i8 [[TMP0]], ptr %a, align 1
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst1q_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i64 15 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1 +// CHECK-NEXT: ret void +// void test_vst1q_lane_s8(int8_t * a, int8x16_t b) { vst1q_lane_s8(a, b, 15); } -// CHECK-LABEL: @test_vst1q_lane_s16( -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 -// CHECK: store i16 [[TMP3]], ptr %a, align 2 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[B]], i64 7 +// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1q_lane_s16(int16_t * a, int16x8_t b) { vst1q_lane_s16(a, b, 7); } -// CHECK-LABEL: @test_vst1q_lane_s32( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> -// CHECK: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 -// CHECK: store i32 [[TMP3]], ptr %a, align 4 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[B]], i64 3 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1q_lane_s32(int32_t * a, int32x4_t b) { vst1q_lane_s32(a, b, 3); } -// CHECK-LABEL: @test_vst1q_lane_s64( -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64> -// CHECK: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> -// CHECK: call void @llvm.arm.neon.vst1.p0.v1i64(ptr %a, <1 x i64> [[TMP3]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x i64> [[B]], <2 x i64> poison, <1 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst1.p0.v1i64(ptr [[A]], <1 x i64> [[TMP0]], i32 4) +// CHECK-NEXT: ret void +// void test_vst1q_lane_s64(int64_t * a, int64x2_t b) { vst1q_lane_s64(a, b, 1); } -// CHECK-LABEL: @test_vst1q_lane_f16( -// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half> -// CHECK: [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7 -// CHECK: store half [[TMP3]], ptr %a, align 2 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x half> [[B]], i64 7 +// CHECK-NEXT: store half [[TMP0]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1q_lane_f16(float16_t * a, float16x8_t b) { vst1q_lane_f16(a, b, 7); } -// CHECK-LABEL: @test_vst1q_lane_f32( -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> -// CHECK: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 -// CHECK: store 
float [[TMP3]], ptr %a, align 4 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[B]], i64 3 +// CHECK-NEXT: store float [[TMP0]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1q_lane_f32(float32_t * a, float32x4_t b) { vst1q_lane_f32(a, b, 3); } -// CHECK-LABEL: @test_vst1q_lane_p8( -// CHECK: [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15 -// CHECK: store i8 [[TMP0]], ptr %a, align 1 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[B]], i64 15 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1 +// CHECK-NEXT: ret void +// void test_vst1q_lane_p8(poly8_t * a, poly8x16_t b) { vst1q_lane_p8(a, b, 15); } -// CHECK-LABEL: @test_vst1q_lane_p16( -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16> -// CHECK: [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 -// CHECK: store i16 [[TMP3]], ptr %a, align 2 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1q_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[B]], i64 7 +// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1q_lane_p16(poly16_t * a, poly16x8_t b) { vst1q_lane_p16(a, b, 7); } -// CHECK-LABEL: @test_vst1_lane_u8( -// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7 -// CHECK: store i8 [[TMP0]], ptr %a, align 1 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i64 7 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1 +// CHECK-NEXT: ret void +// void test_vst1_lane_u8(uint8_t * a, uint8x8_t b) { vst1_lane_u8(a, b, 7); } -// CHECK-LABEL: @test_vst1_lane_u16( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 -// CHECK: store i16 [[TMP3]], ptr %a, align 2 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[B]], i64 3 +// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1_lane_u16(uint16_t * a, uint16x4_t b) { vst1_lane_u16(a, b, 3); } -// CHECK-LABEL: @test_vst1_lane_u32( -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -// CHECK: store i32 [[TMP3]], ptr %a, align 4 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[B]], i64 1 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4 +// CHECK-NEXT: ret 
void +// void test_vst1_lane_u32(uint32_t * a, uint32x2_t b) { vst1_lane_u32(a, b, 1); } -// CHECK-LABEL: @test_vst1_lane_u64( -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 -// CHECK: store i64 [[TMP3]], ptr %a, align 4 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[B]], i64 0 +// CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1_lane_u64(uint64_t * a, uint64x1_t b) { vst1_lane_u64(a, b, 0); } -// CHECK-LABEL: @test_vst1_lane_s8( -// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7 -// CHECK: store i8 [[TMP0]], ptr %a, align 1 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i64 7 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1 +// CHECK-NEXT: ret void +// void test_vst1_lane_s8(int8_t * a, int8x8_t b) { vst1_lane_s8(a, b, 7); } -// CHECK-LABEL: @test_vst1_lane_s16( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 -// CHECK: store i16 [[TMP3]], ptr %a, align 2 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[B]], i64 3 +// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1_lane_s16(int16_t * a, int16x4_t b) { vst1_lane_s16(a, b, 3); } -// CHECK-LABEL: @test_vst1_lane_s32( -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> -// CHECK: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -// CHECK: store i32 [[TMP3]], ptr %a, align 4 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[B]], i64 1 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1_lane_s32(int32_t * a, int32x2_t b) { vst1_lane_s32(a, b, 1); } -// CHECK-LABEL: @test_vst1_lane_s64( -// CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> -// CHECK: [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0 -// CHECK: store i64 [[TMP3]], ptr %a, align 4 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i64> [[B]], i64 0 +// CHECK-NEXT: store i64 [[TMP0]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1_lane_s64(int64_t * a, int64x1_t b) { vst1_lane_s64(a, b, 0); } -// CHECK-LABEL: @test_vst1_lane_f16( -// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half> 
-// CHECK: [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3 -// CHECK: store half [[TMP3]], ptr %a, align 2 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[B]], i64 3 +// CHECK-NEXT: store half [[TMP0]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1_lane_f16(float16_t * a, float16x4_t b) { vst1_lane_f16(a, b, 3); } -// CHECK-LABEL: @test_vst1_lane_f32( -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float> -// CHECK: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -// CHECK: store float [[TMP3]], ptr %a, align 4 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x float> [[B]], i64 1 +// CHECK-NEXT: store float [[TMP0]], ptr [[A]], align 4 +// CHECK-NEXT: ret void +// void test_vst1_lane_f32(float32_t * a, float32x2_t b) { vst1_lane_f32(a, b, 1); } -// CHECK-LABEL: @test_vst1_lane_p8( -// CHECK: [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7 -// CHECK: store i8 [[TMP0]], ptr %a, align 1 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x i8> [[B]], i64 7 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[A]], align 1 +// CHECK-NEXT: ret void +// void test_vst1_lane_p8(poly8_t * a, poly8x8_t b) { vst1_lane_p8(a, b, 7); } -// CHECK-LABEL: @test_vst1_lane_p16( -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16> -// CHECK: [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3 -// CHECK: store i16 [[TMP3]], ptr %a, align 2 -// CHECK: ret void +// CHECK-LABEL: define void @test_vst1_lane_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[B]], i64 3 +// CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2 +// CHECK-NEXT: ret void +// void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) { vst1_lane_p16(a, b, 3); } -// CHECK-LABEL: @test_vst2q_u8( -// CHECK: [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: call void @llvm.arm.neon.vst2.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], 
i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: ret void +// void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) { vst2q_u8(a, b); } -// CHECK-LABEL: @test_vst2q_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: call void @llvm.arm.neon.vst2.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> 
poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 2) +// CHECK-NEXT: ret void +// void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) { vst2q_u16(a, b); } -// CHECK-LABEL: @test_vst2q_u32( -// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> -// CHECK: call void @llvm.arm.neon.vst2.p0.v4i32(ptr %a, <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 4) +// CHECK-NEXT: ret void +// void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) { vst2q_u32(a, b); } -// CHECK-LABEL: @test_vst2q_s8( -// CHECK: [[B:%.*]] = alloca %struct.int8x16x2_t, align 16 -// CHECK: 
[[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: call void @llvm.arm.neon.vst2.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: ret void +// void test_vst2q_s8(int8_t * a, int8x16x2_t b) { vst2q_s8(a, b); } -// CHECK-LABEL: @test_vst2q_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] 
to <8 x i16> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: call void @llvm.arm.neon.vst2.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 2) +// CHECK-NEXT: ret void +// void test_vst2q_s16(int16_t * a, int16x8x2_t b) { vst2q_s16(a, b); } -// CHECK-LABEL: @test_vst2q_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> -// CHECK: call void @llvm.arm.neon.vst2.p0.v4i32(ptr %a, <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], 
i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 4) +// CHECK-NEXT: ret void +// void test_vst2q_s32(int32_t * a, int32x4x2_t b) { vst2q_s32(a, b); } -// CHECK-LABEL: @test_vst2q_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half> -// CHECK: call void @llvm.arm.neon.vst2.p0.v8f16(ptr %a, <8 x half> [[TMP8]], <8 x half> [[TMP9]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8f16(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 2) +// CHECK-NEXT: ret void +// 
void test_vst2q_f16(float16_t * a, float16x8x2_t b) { vst2q_f16(a, b); } -// CHECK-LABEL: @test_vst2q_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> -// CHECK: call void @llvm.arm.neon.vst2.p0.v4f32(ptr %a, <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4f32(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], i32 4) +// CHECK-NEXT: ret void +// void test_vst2q_f32(float32_t * a, float32x4x2_t b) { vst2q_f32(a, b); } -// CHECK-LABEL: @test_vst2q_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load 
<16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: call void @llvm.arm.neon.vst2.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: ret void +// void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) { vst2q_p8(a, b); } -// CHECK-LABEL: @test_vst2q_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: call void @llvm.arm.neon.vst2.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2q_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 
[[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 2) +// CHECK-NEXT: ret void +// void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) { vst2q_p16(a, b); } -// CHECK-LABEL: @test_vst2_u8( -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: call void @llvm.arm.neon.vst2.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: ret void +// void test_vst2_u8(uint8_t * a, uint8x8x2_t b) { vst2_u8(a, b); } -// CHECK-LABEL: @test_vst2_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], 
align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst2.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 2) +// CHECK-NEXT: ret void +// void test_vst2_u16(uint16_t * a, uint16x4x2_t b) { vst2_u16(a, b); } -// CHECK-LABEL: @test_vst2_u32( -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: call void @llvm.arm.neon.vst2.p0.v2i32(ptr %a, <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i32 4) +// CHECK-NEXT: ret void +// void test_vst2_u32(uint32_t * a, uint32x2x2_t b) { vst2_u32(a, b); } -// CHECK-LABEL: @test_vst2_u64( -// CHECK: [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8 -// CHECK: [[__S1:%.*]] = 
alloca %struct.uint64x1x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> -// CHECK: call void @llvm.arm.neon.vst2.p0.v1i64(ptr %a, <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_1_EXTRACT]], i64 0 +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v1i64(ptr [[A]], <1 x i64> [[TMP0]], <1 x i64> [[TMP1]], i32 4) +// CHECK-NEXT: ret void +// void test_vst2_u64(uint64_t * a, uint64x1x2_t b) { vst2_u64(a, b); } -// CHECK-LABEL: @test_vst2_s8( -// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: call void @llvm.arm.neon.vst2.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// 
CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 1) +// CHECK-NEXT: ret void +// void test_vst2_s8(int8_t * a, int8x8x2_t b) { vst2_s8(a, b); } -// CHECK-LABEL: @test_vst2_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst2.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst2_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 2) +// CHECK-NEXT: ret void +// void test_vst2_s16(int16_t * a, int16x4x2_t b) { vst2_s16(a, b); } -// CHECK-LABEL: @test_vst2_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: call void @llvm.arm.neon.vst2.p0.v2i32(ptr %a, <2 x i32> [[TMP8]], 
<2 x i32> [[TMP9]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2_s32(int32_t * a, int32x2x2_t b) {
  vst2_s32(a, b);
}

-// CHECK-LABEL: @test_vst2_s64(
-// CHECK: [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v1i64(ptr %a, <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_s64(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_1_EXTRACT]], i64 0
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v1i64(ptr [[A]], <1 x i64> [[TMP0]], <1 x i64> [[TMP1]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2_s64(int64_t * a, int64x1x2_t b) {
  vst2_s64(a, b);
}

-// CHECK-LABEL: @test_vst2_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v4f16(ptr %a, <4 x half> [[TMP8]], <4 x half> [[TMP9]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4f16(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2_f16(float16_t * a, float16x4x2_t b) {
  vst2_f16(a, b);
}

-// CHECK-LABEL: @test_vst2_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v2f32(ptr %a, <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v2f32(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2_f32(float32_t * a, float32x2x2_t b) {
  vst2_f32(a, b);
}

-// CHECK-LABEL: @test_vst2_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.arm.neon.vst2.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst2_p8(poly8_t * a, poly8x8x2_t b) {
  vst2_p8(a, b);
}

-// CHECK-LABEL: @test_vst2_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst2.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2_p16(poly16_t * a, poly16x4x2_t b) {
  vst2_p16(a, b);
}

-// CHECK-LABEL: @test_vst2q_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) {
  vst2q_lane_u16(a, b, 7);
}

-// CHECK-LABEL: @test_vst2q_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr %a, <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) {
  vst2q_lane_u32(a, b, 3);
}

-// CHECK-LABEL: @test_vst2q_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) {
  vst2q_lane_s16(a, b, 7);
}

-// CHECK-LABEL: @test_vst2q_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr %a, <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
  vst2q_lane_s32(a, b, 3);
}

-// CHECK-LABEL: @test_vst2q_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8f16(ptr %a, <8 x half> [[TMP8]], <8 x half> [[TMP9]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8f16(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
  vst2q_lane_f16(a, b, 7);
}

-// CHECK-LABEL: @test_vst2q_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4f32(ptr %a, <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4f32(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
  vst2q_lane_f32(a, b, 3);
}

-// CHECK-LABEL: @test_vst2q_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr %a, <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) {
  vst2q_lane_p16(a, b, 7);
}

-// CHECK-LABEL: @test_vst2_lane_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_u8(uint8_t * a, uint8x8x2_t b) {
  vst2_lane_u8(a, b, 7);
}

-// CHECK-LABEL: @test_vst2_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) {
  vst2_lane_u16(a, b, 3);
}

-// CHECK-LABEL: @test_vst2_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr %a, <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) {
  vst2_lane_u32(a, b, 1);
}

-// CHECK-LABEL: @test_vst2_lane_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_s8(int8_t * a, int8x8x2_t b) {
  vst2_lane_s8(a, b, 7);
}

-// CHECK-LABEL: @test_vst2_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) {
  vst2_lane_s16(a, b, 3);
}

-// CHECK-LABEL: @test_vst2_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr %a, <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) {
  vst2_lane_s32(a, b, 1);
}

-// CHECK-LABEL: @test_vst2_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4f16(ptr %a, <4 x half> [[TMP8]], <4 x half> [[TMP9]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4f16(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) {
  vst2_lane_f16(a, b, 3);
}

-// CHECK-LABEL: @test_vst2_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v2f32(ptr %a, <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v2f32(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) {
  vst2_lane_f32(a, b, 1);
}

-// CHECK-LABEL: @test_vst2_lane_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_p8(poly8_t * a, poly8x8x2_t b) {
  vst2_lane_p8(a, b, 7);
}

-// CHECK-LABEL: @test_vst2_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 16, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x2_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr %a, <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst2_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) {
  vst2_lane_p16(a, b, 3);
}

-// CHECK-LABEL: @test_vst3q_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.arm.neon.vst3.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
  vst3q_u8(a, b);
}

-// CHECK-LABEL: @test_vst3q_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) {
  vst3q_u16(a, b);
}

-// CHECK-LABEL: @test_vst3q_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v4i32(ptr %a, <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 4)
+// CHECK-NEXT: ret void
+//
void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) {
  vst3q_u32(a, b);
}

-// CHECK-LABEL: @test_vst3q_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: call void @llvm.arm.neon.vst3.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 1)
+// CHECK-NEXT: ret void
+//
void test_vst3q_s8(int8_t * a, int8x16x3_t b) {
  vst3q_s8(a, b);
}

-// CHECK-LABEL: @test_vst3q_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 2)
+// CHECK-NEXT: ret void
+//
void test_vst3q_s16(int16_t * a, int16x8x3_t b) {
  vst3q_s16(a, b);
}

-// CHECK-LABEL: @test_vst3q_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v4i32(ptr %a, <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+//
CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 4) +// CHECK-NEXT: ret void +// void test_vst3q_s32(int32_t * a, int32x4x3_t b) { vst3q_s32(a, b); } -// CHECK-LABEL: @test_vst3q_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half> -// CHECK: call void @llvm.arm.neon.vst3.p0.v8f16(ptr %a, <8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3q_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x half> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8f16(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i32 2) +// CHECK-NEXT: ret void +// void test_vst3q_f16(float16_t * a, float16x8x3_t b) { vst3q_f16(a, b); } -// CHECK-LABEL: @test_vst3q_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float> -// CHECK: call void @llvm.arm.neon.vst3.p0.v4f32(ptr %a, 
<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3q_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x float> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4f32(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], i32 4) +// CHECK-NEXT: ret void +// void test_vst3q_f32(float32_t * a, float32x4x3_t b) { vst3q_f32(a, b); } -// CHECK-LABEL: @test_vst3q_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 -// CHECK: call void @llvm.arm.neon.vst3.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3q_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] 
[[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 1) +// CHECK-NEXT: ret void +// void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) { vst3q_p8(a, b); } -// CHECK-LABEL: @test_vst3q_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> -// CHECK: call void 
@llvm.arm.neon.vst3.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3q_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 2) +// CHECK-NEXT: ret void +// void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) { vst3q_p16(a, b); } -// CHECK-LABEL: @test_vst3_u8( -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: call void @llvm.arm.neon.vst3.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] 
[[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 1) +// CHECK-NEXT: ret void +// void test_vst3_u8(uint8_t * a, uint8x8x3_t b) { vst3_u8(a, b); } -// CHECK-LABEL: @test_vst3_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst3.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 2) +// CHECK-NEXT: ret void +// void test_vst3_u16(uint16_t * a, uint16x4x3_t b) { vst3_u16(a, b); } -// CHECK-LABEL: @test_vst3_u32( -// CHECK: [[B:%.*]] = alloca 
%struct.uint32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> -// CHECK: call void @llvm.arm.neon.vst3.p0.v2i32(ptr %a, <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i32 4) +// CHECK-NEXT: ret void +// void test_vst3_u32(uint32_t * a, uint32x2x3_t b) { vst3_u32(a, b); } -// CHECK-LABEL: @test_vst3_u64( -// CHECK: [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], 
ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -// CHECK: call void @llvm.arm.neon.vst3.p0.v1i64(ptr %a, <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_1_EXTRACT]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v1i64(ptr [[A]], <1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], i32 4) +// CHECK-NEXT: ret void +// void test_vst3_u64(uint64_t * a, uint64x1x3_t b) { vst3_u64(a, b); } -// CHECK-LABEL: @test_vst3_s8( -// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: call void @llvm.arm.neon.vst3.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] 
[[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 1) +// CHECK-NEXT: ret void +// void test_vst3_s8(int8_t * a, int8x8x3_t b) { vst3_s8(a, b); } -// CHECK-LABEL: @test_vst3_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst3.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 2) +// CHECK-NEXT: ret void +// void test_vst3_s16(int16_t * a, int16x4x3_t b) { vst3_s16(a, b); } -// CHECK-LABEL: @test_vst3_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], 
ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> -// CHECK: call void @llvm.arm.neon.vst3.p0.v2i32(ptr %a, <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i32 4) +// CHECK-NEXT: ret void +// void test_vst3_s32(int32_t * a, int32x2x3_t b) { vst3_s32(a, b); } -// CHECK-LABEL: @test_vst3_s64( -// CHECK: [[B:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], ptr [[VAL3]], 
i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -// CHECK: call void @llvm.arm.neon.vst3.p0.v1i64(ptr %a, <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_1_EXTRACT]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v1i64(ptr [[A]], <1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], i32 4) +// CHECK-NEXT: ret void +// void test_vst3_s64(int64_t * a, int64x1x3_t b) { vst3_s64(a, b); } -// CHECK-LABEL: @test_vst3_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half> -// CHECK: call void @llvm.arm.neon.vst3.p0.v4f16(ptr %a, <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x 
i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4f16(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], i32 2) +// CHECK-NEXT: ret void +// void test_vst3_f16(float16_t * a, float16x4x3_t b) { vst3_f16(a, b); } -// CHECK-LABEL: @test_vst3_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float> -// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float> -// CHECK: call void @llvm.arm.neon.vst3.p0.v2f32(ptr %a, <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v2f32(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], i32 4) +// CHECK-NEXT: ret void +// void test_vst3_f32(float32_t * a, float32x2x3_t b) { vst3_f32(a, b); } -// CHECK-LABEL: @test_vst3_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw 
%struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: call void @llvm.arm.neon.vst3.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst3_p8( +// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 1) +// CHECK-NEXT: ret void +// void test_vst3_p8(poly8_t * a, poly8x8x3_t b) { vst3_p8(a, b); } -// CHECK-LABEL: @test_vst3_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: 
[[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst3.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3_p16(poly16_t * a, poly16x4x3_t b) {
   vst3_p16(a, b);
 }
 
-// CHECK-LABEL: @test_vst3q_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) {
   vst3q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: @test_vst3q_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr %a, <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) {
   vst3q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: @test_vst3q_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) {
   vst3q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: @test_vst3q_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr %a, <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
   vst3q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: @test_vst3q_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8f16(ptr %a, <8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8f16(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
   vst3q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: @test_vst3q_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4f32(ptr %a, <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4f32(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) {
   vst3q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: @test_vst3q_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [6 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 48, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr %a, <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [6 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [6 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) {
   vst3q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: @test_vst3_lane_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_u8(uint8_t * a, uint8x8x3_t b) {
   vst3_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: @test_vst3_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) {
   vst3_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: @test_vst3_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr %a, <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) {
   vst3_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: @test_vst3_lane_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_s8(int8_t * a, int8x8x3_t b) {
   vst3_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: @test_vst3_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) {
   vst3_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: @test_vst3_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr %a, <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) {
   vst3_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: @test_vst3_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4f16(ptr %a, <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4f16(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) {
   vst3_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: @test_vst3_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v2f32(ptr %a, <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v2f32(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) {
   vst3_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: @test_vst3_lane_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_p8(poly8_t * a, poly8x8x3_t b) {
   vst3_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: @test_vst3_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 24, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x3_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr %a, <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst3_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) {
   vst3_lane_p16(a, b, 3);
 }
 
-// CHECK-LABEL: @test_vst4q_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.arm.neon.vst4.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) {
   vst4q_u8(a, b);
 }
 
-// CHECK-LABEL: @test_vst4q_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) {
   vst4q_u16(a, b);
 }
 
-// CHECK-LABEL: @test_vst4q_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v4i32(ptr %a, <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) {
   vst4q_u32(a, b);
 }
 
-// CHECK-LABEL: @test_vst4q_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x16x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16
-// CHECK: call void @llvm.arm.neon.vst4.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst4q_s8(int8_t * a, int8x16x4_t b) {
   vst4q_s8(a, b);
 }
 
-// CHECK-LABEL: @test_vst4q_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2
x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 2) +// CHECK-NEXT: ret void +// void test_vst4q_s16(int16_t * a, int16x8x4_t b) { vst4q_s16(a, b); } -// CHECK-LABEL: @test_vst4q_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> -// CHECK: 
[[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32> -// CHECK: call void @llvm.arm.neon.vst4.p0.v4i32(ptr %a, <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4q_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 4) +// CHECK-NEXT: ret void +// void test_vst4q_s32(int32_t * a, int32x4x4_t b) { vst4q_s32(a, b); } -// CHECK-LABEL: @test_vst4q_f16( -// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], 
i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half> -// CHECK: call void @llvm.arm.neon.vst4.p0.v8f16(ptr %a, <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4q_f16( +// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x 
i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x half> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x half> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8f16(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 2) +// CHECK-NEXT: ret void +// void test_vst4q_f16(float16_t * a, float16x8x4_t b) { vst4q_f16(a, b); } -// CHECK-LABEL: @test_vst4q_f32( -// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float> -// CHECK: call void @llvm.arm.neon.vst4.p0.v4f32(ptr %a, <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4q_f32( +// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: 
[[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x float> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x float> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4f32(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]], i32 4) +// CHECK-NEXT: ret void +// void test_vst4q_f32(float32_t * a, float32x4x4_t b) { vst4q_f32(a, b); } -// CHECK-LABEL: @test_vst4q_p8( -// CHECK: [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <16 x i8>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <16 x i8>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP5:%.*]] = load <16 x i8>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x16x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP6:%.*]] = load <16 x i8>, ptr [[ARRAYIDX6]], align 16 -// CHECK: call void @llvm.arm.neon.vst4.p0.v16i8(ptr %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4q_p8( +// 
CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <16 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v16i8(ptr [[A]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i32 1) +// CHECK-NEXT: ret void +// void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) { vst4q_p8(a, b); } -// CHECK-LABEL: @test_vst4q_p16( -// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16 -// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16 -// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8> 
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16 -// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16 -// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16> -// CHECK: call void @llvm.arm.neon.vst4.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4q_p16( +// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4 +// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5 +// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1 +// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6 +// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0 +// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7 +// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 2) +// CHECK-NEXT: ret void +// void 
test_vst4q_p16(poly16_t * a, poly16x8x4_t b) { vst4q_p16(a, b); } -// CHECK-LABEL: @test_vst4_u8( -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8 -// CHECK: call void @llvm.arm.neon.vst4.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_u8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 1) +// CHECK-NEXT: ret void +// void test_vst4_u8(uint8_t * a, uint8x8x4_t b) { vst4_u8(a, b); } -// CHECK-LABEL: @test_vst4_u16( -// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x 
i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst4.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_u16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 2) +// CHECK-NEXT: ret void +// void test_vst4_u16(uint16_t * a, uint16x4x4_t b) { vst4_u16(a, b); } -// CHECK-LABEL: @test_vst4_u32( -// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr 
[[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32> -// CHECK: call void @llvm.arm.neon.vst4.p0.v2i32(ptr %a, <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_u32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 4) +// CHECK-NEXT: ret void +// void test_vst4_u32(uint32_t * a, uint32x2x4_t b) { vst4_u32(a, b); } -// CHECK-LABEL: @test_vst4_u64( -// CHECK: [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], 
ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> -// CHECK: call void @llvm.arm.neon.vst4.p0.v1i64(ptr %a, <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_u64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_1_EXTRACT]], i64 0 +// CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_3_EXTRACT]], i64 0 +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v1i64(ptr [[A]], <1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i32 4) +// CHECK-NEXT: ret void +// void test_vst4_u64(uint64_t * a, uint64x1x4_t b) { vst4_u64(a, b); } -// CHECK-LABEL: @test_vst4_s8( -// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr 
[[ARRAYIDX6]], align 8 -// CHECK: call void @llvm.arm.neon.vst4.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_s8( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 1) +// CHECK-NEXT: ret void +// void test_vst4_s8(int8_t * a, int8x8x4_t b) { vst4_s8(a, b); } -// CHECK-LABEL: @test_vst4_s16( -// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16> -// CHECK: call void @llvm.arm.neon.vst4.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_s16( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] 
[[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 2) +// CHECK-NEXT: ret void +// void test_vst4_s16(int16_t * a, int16x4x4_t b) { vst4_s16(a, b); } -// CHECK-LABEL: @test_vst4_s32( -// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32> -// CHECK: call void @llvm.arm.neon.vst4.p0.v2i32(ptr %a, <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_s32( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: 
[[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32> +// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 4) +// CHECK-NEXT: ret void +// void test_vst4_s32(int32_t * a, int32x2x4_t b) { vst4_s32(a, b); } -// CHECK-LABEL: @test_vst4_s64( -// CHECK: [[B:%.*]] = alloca %struct.int64x1x4_t, align 8 -// CHECK: [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false) -// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL]], i32 0, i32 0 -// CHECK: [[TMP4:%.*]] = load <1 x i64>, ptr [[ARRAYIDX]], align 8 -// CHECK: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8> -// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL1]], i32 0, i32 1 -// CHECK: [[TMP6:%.*]] = load <1 x i64>, ptr [[ARRAYIDX2]], align 8 -// CHECK: [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8> -// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL3]], i32 0, i32 2 -// CHECK: [[TMP8:%.*]] = load <1 x i64>, ptr [[ARRAYIDX4]], align 8 -// CHECK: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> -// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int64x1x4_t, ptr [[__S1]], i32 0, i32 0 -// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], ptr [[VAL5]], i32 0, i32 3 -// CHECK: [[TMP10:%.*]] = load <1 x i64>, ptr [[ARRAYIDX6]], align 8 -// CHECK: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8> -// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> -// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64> -// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64> -// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64> -// CHECK: call void @llvm.arm.neon.vst4.p0.v1i64(ptr %a, <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4) -// CHECK: ret void +// CHECK-LABEL: define void @test_vst4_s64( +// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP0:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0 +// 
CHECK-NEXT: [[TMP1:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_1_EXTRACT]], i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x i64> poison, i64 [[B_COERCE_FCA_3_EXTRACT]], i64 0
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v1i64(ptr [[A]], <1 x i64> [[TMP0]], <1 x i64> [[TMP1]], <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst4_s64(int64_t * a, int64x1x4_t b) {
   vst4_s64(a, b);
 }
 
-// CHECK-LABEL: @test_vst4_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v4f16(ptr %a, <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4f16(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], <4 x half> [[TMP3]], i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst4_f16(float16_t * a, float16x4x4_t b) {
   vst4_f16(a, b);
 }
 
-// CHECK-LABEL: @test_vst4_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v2f32(ptr %a, <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v2f32(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]], i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst4_f32(float32_t * a, float32x2x4_t b) {
   vst4_f32(a, b);
 }
 
-// CHECK-LABEL: @test_vst4_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.arm.neon.vst4.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst4_p8(poly8_t * a, poly8x8x4_t b) {
   vst4_p8(a, b);
 }
 
-// CHECK-LABEL: @test_vst4_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst4.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
   vst4_p16(a, b);
 }
 
-// CHECK-LABEL: @test_vst4q_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
   vst4q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: @test_vst4q_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr %a, <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
   vst4q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: @test_vst4q_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
   vst4q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: @test_vst4q_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr %a, <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr [[A]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
   vst4q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: @test_vst4q_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x half>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x half>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x half>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x half>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8f16(ptr %a, <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8f16(ptr [[A]], <8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
   vst4q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: @test_vst4q_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x float>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x float>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4f32(ptr %a, <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 3, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <4 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4f32(ptr [[A]], <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]], i32 3, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
   vst4q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: @test_vst4q_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [8 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 16
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 16 [[__S1]], ptr align 16 [[B]], i32 64, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <8 x i16>, ptr [[ARRAYIDX]], align 16
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <8 x i16>, ptr [[ARRAYIDX2]], align 16
-// CHECK: [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <8 x i16>, ptr [[ARRAYIDX4]], align 16
-// CHECK: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <8 x i16>, ptr [[ARRAYIDX6]], align 16
-// CHECK: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr %a, <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4q_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [8 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_0_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_0_0_VEC_INSERT]], i64 [[B_COERCE_FCA_1_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_SROA_3_16_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_2_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[B_SROA_3_24_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_3_16_VEC_INSERT]], i64 [[B_COERCE_FCA_3_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_4_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 4
+// CHECK-NEXT: [[B_SROA_6_32_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_4_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_5_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 5
+// CHECK-NEXT: [[B_SROA_6_40_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_6_32_VEC_INSERT]], i64 [[B_COERCE_FCA_5_EXTRACT]], i64 1
+// CHECK-NEXT: [[B_COERCE_FCA_6_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 6
+// CHECK-NEXT: [[B_SROA_9_48_VEC_INSERT:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE_FCA_6_EXTRACT]], i64 0
+// CHECK-NEXT: [[B_COERCE_FCA_7_EXTRACT:%.*]] = extractvalue [8 x i64] [[B_COERCE]], 7
+// CHECK-NEXT: [[B_SROA_9_56_VEC_INSERT:%.*]] = insertelement <2 x i64> [[B_SROA_9_48_VEC_INSERT]], i64 [[B_COERCE_FCA_7_EXTRACT]], i64 1
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[B_SROA_0_8_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[B_SROA_3_24_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[B_SROA_6_40_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B_SROA_9_56_VEC_INSERT]] to <8 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr [[A]], <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], i32 7, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) {
   vst4q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: @test_vst4_lane_u8(
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_u8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_u8(uint8_t * a, uint8x8x4_t b) {
   vst4_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: @test_vst4_lane_u16(
-// CHECK: [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_u16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 3, i32 2)
+// CHECK-NEXT: ret void
++//
 void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) {
   vst4_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: @test_vst4_lane_u32(
-// CHECK: [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.uint32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr %a, <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_u32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) {
   vst4_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: @test_vst4_lane_s8(
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_s8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_s8(int8_t * a, int8x8x4_t b) {
   vst4_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: @test_vst4_lane_s16(
-// CHECK: [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_s16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) {
   vst4_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: @test_vst4_lane_s32(
-// CHECK: [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x i32>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.int32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr %a, <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_s32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x i32>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr [[A]], <2 x i32> [[TMP0]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) {
   vst4_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: @test_vst4_lane_f16(
-// CHECK: [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x half>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x half>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x half>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x half>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4f16(ptr %a, <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_f16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x half>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x half>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4f16(ptr [[A]], <4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], <4 x half> [[TMP3]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) {
   vst4_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: @test_vst4_lane_f32(
-// CHECK: [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <2 x float>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.float32x2x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <2 x float>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v2f32(ptr %a, <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 1, i32 4)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_f32(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <2 x float>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <2 x float>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v2f32(ptr [[A]], <2 x float> [[TMP0]], <2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]], i32 1, i32 4)
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) { vst4_lane_f32(a, b, 1); }
-// CHECK-LABEL: @test_vst4_lane_p8(
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP3:%.*]] = load <8 x i8>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6]], align 8
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_p8(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i32 7, i32 1)
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_p8(poly8_t * a, poly8x8x4_t b) { vst4_lane_p8(a, b, 7); }
-// CHECK-LABEL: @test_vst4_lane_p16(
-// CHECK: [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[__S1]], ptr align 8 [[B]], i32 32, i1 false)
-// CHECK: [[VAL:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL]], i32 0, i32 0
-// CHECK: [[TMP4:%.*]] = load <4 x i16>, ptr [[ARRAYIDX]], align 8
-// CHECK: [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK: [[VAL1:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL1]], i32 0, i32 1
-// CHECK: [[TMP6:%.*]] = load <4 x i16>, ptr [[ARRAYIDX2]], align 8
-// CHECK: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK: [[VAL3:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL3]], i32 0, i32 2
-// CHECK: [[TMP8:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 8
-// CHECK: [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK: [[VAL5:%.*]] = getelementptr inbounds nuw %struct.poly16x4x4_t, ptr [[__S1]], i32 0, i32 0
-// CHECK: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], ptr [[VAL5]], i32 0, i32 3
-// CHECK: [[TMP10:%.*]] = load <4 x i16>, ptr [[ARRAYIDX6]], align 8
-// CHECK: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK: [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK: [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK: [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK: [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr %a, <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vst4_lane_p16(
+// CHECK-SAME: ptr noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2
+// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[B_COERCE_FCA_0_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[B_COERCE_FCA_1_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_COERCE_FCA_2_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_COERCE_FCA_3_EXTRACT]] to <4 x i16>
+// CHECK-NEXT: call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr [[A]], <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], i32 3, i32 2)
+// CHECK-NEXT: ret void
+//
 void test_vst4_lane_p16(poly16_t * a, poly16x4x4_t b) { vst4_lane_p16(a, b, 3); }
-// CHECK-LABEL: @test_vsub_s8(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define <8 x i8> @test_vsub_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
 int8x8_t test_vsub_s8(int8x8_t a, int8x8_t b) { return vsub_s8(a, b); }
-// CHECK-LABEL: @test_vsub_s16(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define <4 x i16> @test_vsub_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
 int16x4_t test_vsub_s16(int16x4_t a, int16x4_t b) { return vsub_s16(a, b); }
-// CHECK-LABEL: @test_vsub_s32(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define <2 x i32> @test_vsub_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
 int32x2_t test_vsub_s32(int32x2_t a, int32x2_t b) { return vsub_s32(a, b); }
-// CHECK-LABEL: @test_vsub_s64(
-// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[SUB_I]]
+// CHECK-LABEL: define <1 x i64> @test_vsub_s64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[SUB_I]]
+//
 int64x1_t test_vsub_s64(int64x1_t a, int64x1_t b) { return vsub_s64(a, b); }
-// CHECK-LABEL: @test_vsub_f32(
-// CHECK: [[SUB_I:%.*]] = fsub <2 x float> %a, %b
-// CHECK: ret <2 x float> [[SUB_I]]
+// CHECK-LABEL: define <2 x float> @test_vsub_f32(
+// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <2 x float> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x float> [[SUB_I]]
+//
 float32x2_t test_vsub_f32(float32x2_t a, float32x2_t b) { return vsub_f32(a, b); }
-// CHECK-LABEL: @test_vsub_u8(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i8> %a, %b
-// CHECK: ret <8 x i8> [[SUB_I]]
+// CHECK-LABEL: define <8 x i8> @test_vsub_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i8> [[SUB_I]]
+//
 uint8x8_t test_vsub_u8(uint8x8_t a, uint8x8_t b) { return vsub_u8(a, b); }
-// CHECK-LABEL: @test_vsub_u16(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i16> %a, %b
-// CHECK: ret <4 x i16> [[SUB_I]]
+// CHECK-LABEL: define <4 x i16> @test_vsub_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i16> [[SUB_I]]
+//
 uint16x4_t test_vsub_u16(uint16x4_t a, uint16x4_t b) { return vsub_u16(a, b); }
-// CHECK-LABEL: @test_vsub_u32(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i32> %a, %b
-// CHECK: ret <2 x i32> [[SUB_I]]
+// CHECK-LABEL: define <2 x i32> @test_vsub_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i32> [[SUB_I]]
+//
 uint32x2_t test_vsub_u32(uint32x2_t a, uint32x2_t b) { return vsub_u32(a, b); }
-// CHECK-LABEL: @test_vsub_u64(
-// CHECK: [[SUB_I:%.*]] = sub <1 x i64> %a, %b
-// CHECK: ret <1 x i64> [[SUB_I]]
+// CHECK-LABEL: define <1 x i64> @test_vsub_u64(
+// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <1 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <1 x i64> [[SUB_I]]
+//
 uint64x1_t test_vsub_u64(uint64x1_t a, uint64x1_t b) { return vsub_u64(a, b); }
-// CHECK-LABEL: @test_vsubq_s8(
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define <16 x i8> @test_vsubq_s8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
 int8x16_t test_vsubq_s8(int8x16_t a, int8x16_t b) { return vsubq_s8(a, b); }
-// CHECK-LABEL: @test_vsubq_s16(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vsubq_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 int16x8_t test_vsubq_s16(int16x8_t a, int16x8_t b) { return vsubq_s16(a, b); }
-// CHECK-LABEL: @test_vsubq_s32(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vsubq_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 int32x4_t test_vsubq_s32(int32x4_t a, int32x4_t b) { return vsubq_s32(a, b); }
-// CHECK-LABEL: @test_vsubq_s64(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vsubq_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 int64x2_t test_vsubq_s64(int64x2_t a, int64x2_t b) { return vsubq_s64(a, b); }
-// CHECK-LABEL: @test_vsubq_f32(
-// CHECK: [[SUB_I:%.*]] = fsub <4 x float> %a, %b
-// CHECK: ret <4 x float> [[SUB_I]]
+// CHECK-LABEL: define <4 x float> @test_vsubq_f32(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = fsub <4 x float> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x float> [[SUB_I]]
+//
 float32x4_t test_vsubq_f32(float32x4_t a, float32x4_t b) { return vsubq_f32(a, b); }
-// CHECK-LABEL: @test_vsubq_u8(
-// CHECK: [[SUB_I:%.*]] = sub <16 x i8> %a, %b
-// CHECK: ret <16 x i8> [[SUB_I]]
+// CHECK-LABEL: define <16 x i8> @test_vsubq_u8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <16 x i8> [[A]], [[B]]
+// CHECK-NEXT: ret <16 x i8> [[SUB_I]]
+//
 uint8x16_t test_vsubq_u8(uint8x16_t a, uint8x16_t b) { return vsubq_u8(a, b); }
-// CHECK-LABEL: @test_vsubq_u16(
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vsubq_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 uint16x8_t test_vsubq_u16(uint16x8_t a, uint16x8_t b) { return vsubq_u16(a, b); }
-// CHECK-LABEL: @test_vsubq_u32(
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vsubq_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 uint32x4_t test_vsubq_u32(uint32x4_t a, uint32x4_t b) { return vsubq_u32(a, b); }
-// CHECK-LABEL: @test_vsubq_u64(
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vsubq_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 uint64x2_t test_vsubq_u64(uint64x2_t a, uint64x2_t b) { return vsubq_u64(a, b); }
-// CHECK-LABEL: @test_vsubhn_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSUBHN2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vsubhn_s16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSUBHN2_I]]
+//
 int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) { return vsubhn_s16(a, b); }
-// CHECK-LABEL: @test_vsubhn_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSUBHN2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vsubhn_s32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSUBHN2_I]]
+//
 int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) { return vsubhn_s32(a, b); }
-// CHECK-LABEL: @test_vsubhn_s64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSUBHN2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vsubhn_s64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSUBHN2_I]]
+//
 int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) { return vsubhn_s64(a, b); }
-// CHECK-LABEL: @test_vsubhn_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <8 x i16> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
-// CHECK: ret <8 x i8> [[VSUBHN2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vsubhn_u16(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <8 x i16> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], splat (i16 8)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK-NEXT: ret <8 x i8> [[VSUBHN2_I]]
+//
 uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) { return vsubhn_u16(a, b); }
-// CHECK-LABEL: @test_vsubhn_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <4 x i32> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
-// CHECK: ret <4 x i16> [[VSUBHN2_I]]
+// CHECK-LABEL: define <4 x i16> @test_vsubhn_u32(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <4 x i32> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], splat (i32 16)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK-NEXT: ret <4 x i16> [[VSUBHN2_I]]
+//
 uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) { return vsubhn_u32(a, b); }
-// CHECK-LABEL: @test_vsubhn_u64(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK: [[VSUBHN_I:%.*]] = sub <2 x i64> %a, %b
-// CHECK: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
-// CHECK: [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
-// CHECK: ret <2 x i32> [[VSUBHN2_I]]
+// CHECK-LABEL: define <2 x i32> @test_vsubhn_u64(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VSUBHN_I:%.*]] = sub <2 x i64> [[A]], [[B]]
+// CHECK-NEXT: [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], splat (i64 32)
+// CHECK-NEXT: [[VSUBHN2_I:%.*]] = trunc nuw <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK-NEXT: ret <2 x i32> [[VSUBHN2_I]]
+//
 uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) { return vsubhn_u64(a, b); }
-// CHECK-LABEL: @test_vsubl_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vsubl_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <8 x i16> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) { return vsubl_s8(a, b); }
-// CHECK-LABEL: @test_vsubl_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vsubl_s16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <4 x i32> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) { return vsubl_s16(a, b); }
-// CHECK-LABEL: @test_vsubl_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vsubl_s32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <2 x i64> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) { return vsubl_s32(a, b); }
-// CHECK-LABEL: @test_vsubl_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vsubl_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <8 x i16> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) { return vsubl_u8(a, b); }
-// CHECK-LABEL: @test_vsubl_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %a to <4 x i32>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vsubl_u16(
+// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[A]] to <4 x i32>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <4 x i32> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) { return vsubl_u16(a, b); }
-// CHECK-LABEL: @test_vsubl_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %a to <2 x i64>
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I4_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vsubl_u32(
+// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[A]] to <2 x i64>
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub nsw <2 x i64> [[VMOVL_I4_I]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) { return vsubl_u32(a, b); }
-// CHECK-LABEL: @test_vsubw_s8(
-// CHECK: [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vsubw_s8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) { return vsubw_s8(a, b); }
-// CHECK-LABEL: @test_vsubw_s16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vsubw_s16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) { return vsubw_s16(a, b); }
-// CHECK-LABEL: @test_vsubw_s32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = sext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vsubw_s32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = sext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) { return vsubw_s32(a, b); }
-// CHECK-LABEL: @test_vsubw_u8(
-// CHECK: [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
-// CHECK: [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
-// CHECK: ret <8 x i16> [[SUB_I]]
+// CHECK-LABEL: define <8 x i16> @test_vsubw_u8(
+// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <8 x i16> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <8 x i16> [[SUB_I]]
+//
 uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) { return vsubw_u8(a, b); }
-// CHECK-LABEL: @test_vsubw_u16(
-// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <4 x i16> %b to <4 x i32>
-// CHECK: [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
-// CHECK: ret <4 x i32> [[SUB_I]]
+// CHECK-LABEL: define <4 x i32> @test_vsubw_u16(
+// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <4 x i16> [[B]] to <4 x i32>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
+//
 uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) { return vsubw_u16(a, b); }
-// CHECK-LABEL: @test_vsubw_u32(
-// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VMOVL_I_I:%.*]] = zext <2 x i32> %b to <2 x i64>
-// CHECK: [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
-// CHECK: ret <2 x i64> [[SUB_I]]
+// CHECK-LABEL: define <2 x i64> @test_vsubw_u32(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VMOVL_I_I:%.*]] = zext <2 x i32> [[B]] to <2 x i64>
+// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A]], [[VMOVL_I_I]]
+// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
+//
 uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) { return vsubw_u32(a, b); }
-// CHECK-LABEL: @test_vtbl1_u8(
-// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]]
+//
 uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) { return vtbl1_u8(a, b); }
-// CHECK-LABEL: @test_vtbl1_s8(
-// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]]
+//
 int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) { return vtbl1_s8(a, b); }
-// CHECK-LABEL: @test_vtbl1_p8(
-// CHECK: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]]
+//
 poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) { return vtbl1_p8(a, b); }
-// CHECK-LABEL: @test_vtbl2_u8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [2 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_u8(
+// CHECK-SAME: [2 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
+//
 uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) { return vtbl2_u8(a, b); }
-// CHECK-LABEL: @test_vtbl2_s8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [2 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_s8(
+// CHECK-SAME: [2 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
+//
 int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) { return vtbl2_s8(a, b); }
-// CHECK-LABEL: @test_vtbl2_p8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [2 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_p8(
+// CHECK-SAME: [2 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
+//
 poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) { return vtbl2_p8(a, b); }
-// CHECK-LABEL: @test_vtbl3_u8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [3 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [3 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL3_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_u8(
+// CHECK-SAME: [3 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
+//
 uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) { return vtbl3_u8(a, b); }
-// CHECK-LABEL: @test_vtbl3_s8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [3 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [3 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL3_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_s8(
+// CHECK-SAME: [3 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
+//
 int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) { return vtbl3_s8(a, b); }
-// CHECK-LABEL: @test_vtbl3_p8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [3 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [3 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL3_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_p8(
+// CHECK-SAME: [3 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
+//
 poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) { return vtbl3_p8(a, b); }
-// CHECK-LABEL: @test_vtbl4_u8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [4 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [4 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL4_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_u8(
+// CHECK-SAME: [4 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
+//
 uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) { return vtbl4_u8(a, b); }
-// CHECK-LABEL: @test_vtbl4_s8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [4 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [4 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL4_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_s8(
+// CHECK-SAME: [4 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
+//
 int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) { return vtbl4_s8(a, b); }
-// CHECK-LABEL: @test_vtbl4_p8(
-// CHECK: [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: store [4 x i64] [[A]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[A]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [4 x i64] [[TMP2]], ptr [[__P0_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P0_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 2
-// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8
-// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P0_I]], i32 0, i32 3
-// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8
-// CHECK: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b)
-// CHECK: ret <8 x i8> [[VTBL4_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_p8(
+// CHECK-SAME: [4 x i64] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 2
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[A_COERCE]], 3
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[B]])
+// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
+//
 poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) { return vtbl4_p8(a, b); }
-// CHECK-LABEL: @test_vtbx1_u8(
-// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]]
+//
 uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) { return vtbx1_u8(a, b, c); }
-// CHECK-LABEL: @test_vtbx1_s8(
-// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]]
+//
 int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) { return vtbx1_s8(a, b, c); }
-// CHECK-LABEL: @test_vtbx1_p8(
-// CHECK: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX1_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> [[A]], <8 x i8> [[B]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]]
+//
 poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) { return vtbx1_p8(a, b, c); }
-// CHECK-LABEL: @test_vtbx2_u8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_u8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
+//
 uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) { return vtbx2_u8(a, b, c); }
-// CHECK-LABEL: @test_vtbx2_s8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8
-// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x2_t, ptr [[B]], i32 0, i32 0
-// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8
-// CHECK: store [2 x i64] [[TMP2]], ptr [[__P1_I]], align 8
-// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8
-// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1
-// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8
-// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c)
-// CHECK: ret <8 x i8> [[VTBX2_I]]
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_s8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8>
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[C]])
+// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
+//
 int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) { return vtbx2_s8(a, b, c); }
-// CHECK-LABEL: @test_vtbx2_p8(
-// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK: [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-//
CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [2 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x2_t, ptr [[B]], i32 0, i32 0 -// CHECK: [[TMP2:%.*]] = load [2 x i64], ptr [[COERCE_DIVE1]], align 8 -// CHECK: store [2 x i64] [[TMP2]], ptr [[__P1_I]], align 8 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8 -// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8 -// CHECK: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c) -// CHECK: ret <8 x i8> [[VTBX2_I]] +// CHECK-LABEL: define <8 x i8> @test_vtbx2_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [2 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_1_INSERT_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]] +// poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) { return vtbx2_p8(a, b, c); } -// CHECK-LABEL: @test_vtbx3_u8( -// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8 -// CHECK: store [3 x i64] [[TMP2]], ptr [[__P1_I]], align 8 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8 -// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8 -// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8 -// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) -// CHECK: ret <8 x i8> [[VTBX3_I]] +// CHECK-LABEL: define <8 x i8> @test_vtbx3_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8> 
+// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]] +// uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) { return vtbx3_u8(a, b, c); } -// CHECK-LABEL: @test_vtbx3_s8( -// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8 -// CHECK: store [3 x i64] [[TMP2]], ptr [[__P1_I]], align 8 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8 -// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8 -// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8 -// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) -// CHECK: ret <8 x i8> [[VTBX3_I]] +// CHECK-LABEL: define <8 x i8> @test_vtbx3_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]] +// int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) { return vtbx3_s8(a, b, c); } -// CHECK-LABEL: @test_vtbx3_p8( -// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [3 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x3_t, ptr [[B]], i32 0, i32 0 -// CHECK: [[TMP2:%.*]] = load [3 x i64], ptr [[COERCE_DIVE1]], align 8 -// CHECK: store [3 x i64] [[TMP2]], ptr [[__P1_I]], align 8 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8 -// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8 -// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8 -// CHECK: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> 
[[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) -// CHECK: ret <8 x i8> [[VTBX3_I]] +// CHECK-LABEL: define <8 x i8> @test_vtbx3_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [3 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_2_INSERT_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]] +// poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) { return vtbx3_p8(a, b, c); } -// CHECK-LABEL: @test_vtbx4_u8( -// CHECK: [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.uint8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8 -// CHECK: store [4 x i64] [[TMP2]], ptr [[__P1_I]], align 8 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8 -// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8 -// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8 -// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3 -// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8 -// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) -// CHECK: ret <8 x i8> [[VTBX4_I]] +// CHECK-LABEL: define <8 x i8> @test_vtbx4_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> 
[[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]] +// uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) { return vtbx4_u8(a, b, c); } -// CHECK-LABEL: @test_vtbx4_s8( -// CHECK: [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.int8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8 -// CHECK: store [4 x i64] [[TMP2]], ptr [[__P1_I]], align 8 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8 -// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr [[ARRAYIDX2_I]], align 8 -// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8 -// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3 -// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8 -// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) -// CHECK: ret <8 x i8> [[VTBX4_I]] +// CHECK-LABEL: define <8 x i8> @test_vtbx4_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]] +// int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) { return vtbx4_s8(a, b, c); } -// CHECK-LABEL: @test_vtbx4_p8( -// CHECK: [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK: [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: store [4 x i64] [[B]].coerce, ptr [[COERCE_DIVE]], align 8 -// CHECK: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw %struct.poly8x8x4_t, ptr [[B]], i32 0, i32 0 -// CHECK: [[TMP2:%.*]] = load [4 x i64], ptr [[COERCE_DIVE1]], align 8 -// CHECK: store [4 x i64] [[TMP2]], ptr [[__P1_I]], align 8 -// CHECK: [[TMP4:%.*]] = load <8 x i8>, ptr [[__P1_I]], align 8 -// CHECK: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 1 -// CHECK: [[TMP5:%.*]] = load <8 x i8>, ptr 
[[ARRAYIDX2_I]], align 8 -// CHECK: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 2 -// CHECK: [[TMP6:%.*]] = load <8 x i8>, ptr [[ARRAYIDX4_I]], align 8 -// CHECK: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], ptr [[__P1_I]], i32 0, i32 3 -// CHECK: [[TMP7:%.*]] = load <8 x i8>, ptr [[ARRAYIDX6_I]], align 8 -// CHECK: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) -// CHECK: ret <8 x i8> [[VTBX4_I]] +// CHECK-LABEL: define <8 x i8> @test_vtbx4_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], [4 x i64] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_0_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 1 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_1_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 2 +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_2_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i64] [[B_COERCE]], 3 +// CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[DOTFCA_3_INSERT_FCA_3_EXTRACT]] to <8 x i8> +// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> [[A]], <8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]] +// poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) { return vtbx4_p8(a, b, c); } -// CHECK: @test_vtrn_s8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrn_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META3:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META3]] +// CHECK-NEXT: ret void +// int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) { return vtrn_s8(a, b); } -// CHECK: @test_vtrn_s16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x 
i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrn_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META6:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: store <4 x i16> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META6]] +// CHECK-NEXT: ret void +// int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) { return vtrn_s16(a, b); } -// CHECK: @test_vtrn_s32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrn_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META9:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: store <2 x i32> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META9]] +// CHECK-NEXT: ret void +// int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) { return vtrn_s32(a, b); } -// CHECK: @test_vtrn_u8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrn_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META12:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// 
CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META12]] +// CHECK-NEXT: ret void +// uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) { return vtrn_u8(a, b); } -// CHECK: @test_vtrn_u16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrn_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META15:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: store <4 x i16> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META15]] +// CHECK-NEXT: ret void +// uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) { return vtrn_u16(a, b); } -// CHECK: @test_vtrn_u32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> -// CHECK: store <2 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrn_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: store <2 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META18:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// CHECK-NEXT: store <2 x i32> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META18]] +// CHECK-NEXT: ret void +// uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) { return vtrn_u32(a, b); } -// CHECK: @test_vtrn_f32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: store <2 x float> [[VTRN_I]], ptr 
[[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> -// CHECK: store <2 x float> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrn_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// CHECK-NEXT: store <2 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META21:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// CHECK-NEXT: store <2 x float> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META21]] +// CHECK-NEXT: ret void +// float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) { return vtrn_f32(a, b); } -// CHECK: @test_vtrn_p8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrn_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META24:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META24]] +// CHECK-NEXT: ret void +// poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) { return vtrn_p8(a, b); } -// CHECK: @test_vtrn_p16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrn_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: store <4 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, 
!alias.scope [[META27:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// CHECK-NEXT: store <4 x i16> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META27]] +// CHECK-NEXT: ret void +// poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) { return vtrn_p16(a, b); } -// CHECK: @test_vtrnq_s8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META30:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META30]] +// CHECK-NEXT: ret void +// int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) { return vtrnq_s8(a, b); } -// CHECK: @test_vtrnq_s16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_s16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META33:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META33]] +// CHECK-NEXT: ret void +// int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) { return vtrnq_s16(a, b); } -// CHECK: @test_vtrnq_s32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> 
[[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_s32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META36:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: store <4 x i32> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META36]] +// CHECK-NEXT: ret void +// int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) { return vtrnq_s32(a, b); } -// CHECK: @test_vtrnq_u8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_u8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META39:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META39]] +// CHECK-NEXT: ret void +// uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) { return vtrnq_u8(a, b); } -// CHECK: @test_vtrnq_u16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_u16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VTRN_I]], ptr 
[[AGG_RESULT]], align 4, !alias.scope [[META42:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META42]] +// CHECK-NEXT: ret void +// uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) { return vtrnq_u16(a, b); } -// CHECK: @test_vtrnq_u32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> -// CHECK: store <4 x i32> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_u32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: store <4 x i32> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META45:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// CHECK-NEXT: store <4 x i32> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META45]] +// CHECK-NEXT: ret void +// uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) { return vtrnq_u32(a, b); } -// CHECK: @test_vtrnq_f32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> -// CHECK: store <4 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> -// CHECK: store <4 x float> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_f32( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// CHECK-NEXT: store <4 x float> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META48:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// CHECK-NEXT: store <4 x float> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META48]] +// CHECK-NEXT: ret void +// float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) { return vtrnq_f32(a, b); } -// CHECK: @test_vtrnq_p8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VTRN_I:%.*]] = 
shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> -// CHECK: store <16 x i8> [[VTRN1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_p8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META51:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: store <16 x i8> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META51]] +// CHECK-NEXT: ret void +// poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) { return vtrnq_p8(a, b); } -// CHECK: @test_vtrnq_p16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VTRN1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> -// CHECK: store <8 x i16> [[VTRN1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vtrnq_p16( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VTRN_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META54:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16 +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i16> [[VTRN1_I]], ptr [[TMP0]], align 4, !alias.scope [[META54]] +// CHECK-NEXT: ret void +// poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) { return vtrnq_p16(a, b); } -// CHECK-LABEL: @test_vtst_s8( -// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b -// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> -// CHECK: ret <8 x i8> [[VTST_I]] +// CHECK-LABEL: define <8 x i8> @test_vtst_s8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VTST_I]] +// uint8x8_t test_vtst_s8(int8x8_t a, int8x8_t b) { return vtst_s8(a, b); } -// CHECK-LABEL: @test_vtst_s16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: 
[[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b -// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> -// CHECK: ret <4 x i16> [[VTST_I]] +// CHECK-LABEL: define <4 x i16> @test_vtst_s16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i16> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i16> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VTST_I]] +// uint16x4_t test_vtst_s16(int16x4_t a, int16x4_t b) { return vtst_s16(a, b); } -// CHECK-LABEL: @test_vtst_s32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <2 x i32> %a, %b -// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32> -// CHECK: ret <2 x i32> [[VTST_I]] +// CHECK-LABEL: define <2 x i32> @test_vtst_s32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <2 x i32> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VTST_I]] +// uint32x2_t test_vtst_s32(int32x2_t a, int32x2_t b) { return vtst_s32(a, b); } -// CHECK-LABEL: @test_vtst_u8( -// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b -// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> -// CHECK: ret <8 x i8> [[VTST_I]] +// CHECK-LABEL: define <8 x i8> @test_vtst_u8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VTST_I]] +// uint8x8_t test_vtst_u8(uint8x8_t a, uint8x8_t b) { return vtst_u8(a, b); } -// CHECK-LABEL: @test_vtst_u16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b -// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> -// CHECK: ret <4 x i16> [[VTST_I]] +// CHECK-LABEL: define <4 x i16> @test_vtst_u16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i16> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i16> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VTST_I]] +// uint16x4_t test_vtst_u16(uint16x4_t a, uint16x4_t b) { return vtst_u16(a, b); } -// CHECK-LABEL: @test_vtst_u32( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <2 x i32> %a, %b -// CHECK: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <2 x i1> [[TMP3]] to <2 x i32> -// CHECK: ret <2 
x i32> [[VTST_I]] +// CHECK-LABEL: define <2 x i32> @test_vtst_u32( +// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <2 x i32> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32> +// CHECK-NEXT: ret <2 x i32> [[VTST_I]] +// uint32x2_t test_vtst_u32(uint32x2_t a, uint32x2_t b) { return vtst_u32(a, b); } -// CHECK-LABEL: @test_vtst_p8( -// CHECK: [[TMP0:%.*]] = and <8 x i8> %a, %b -// CHECK: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> -// CHECK: ret <8 x i8> [[VTST_I]] +// CHECK-LABEL: define <8 x i8> @test_vtst_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i8> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[VTST_I]] +// uint8x8_t test_vtst_p8(poly8x8_t a, poly8x8_t b) { return vtst_p8(a, b); } -// CHECK-LABEL: @test_vtst_p16( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = and <4 x i16> %a, %b -// CHECK: [[TMP3:%.*]] = icmp ne <4 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i16> -// CHECK: ret <4 x i16> [[VTST_I]] +// CHECK-LABEL: define <4 x i16> @test_vtst_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i16> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i16> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16> +// CHECK-NEXT: ret <4 x i16> [[VTST_I]] +// uint16x4_t test_vtst_p16(poly16x4_t a, poly16x4_t b) { return vtst_p16(a, b); } -// CHECK-LABEL: @test_vtstq_s8( -// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b -// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> -// CHECK: ret <16 x i8> [[VTST_I]] +// CHECK-LABEL: define <16 x i8> @test_vtstq_s8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VTST_I]] +// uint8x16_t test_vtstq_s8(int8x16_t a, int8x16_t b) { return vtstq_s8(a, b); } -// CHECK-LABEL: @test_vtstq_s16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b -// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> -// CHECK: ret <8 x i16> [[VTST_I]] +// CHECK-LABEL: define <8 x i16> @test_vtstq_s16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i16> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext 
<8 x i1> [[TMP1]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VTST_I]] +// uint16x8_t test_vtstq_s16(int16x8_t a, int16x8_t b) { return vtstq_s16(a, b); } -// CHECK-LABEL: @test_vtstq_s32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <4 x i32> %a, %b -// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> -// CHECK: ret <4 x i32> [[VTST_I]] +// CHECK-LABEL: define <4 x i32> @test_vtstq_s32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i32> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VTST_I]] +// uint32x4_t test_vtstq_s32(int32x4_t a, int32x4_t b) { return vtstq_s32(a, b); } -// CHECK-LABEL: @test_vtstq_u8( -// CHECK: [[TMP0:%.*]] = and <16 x i8> %a, %b -// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> -// CHECK: ret <16 x i8> [[VTST_I]] +// CHECK-LABEL: define <16 x i8> @test_vtstq_u8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VTST_I]] +// uint8x16_t test_vtstq_u8(uint8x16_t a, uint8x16_t b) { return vtstq_u8(a, b); } -// CHECK-LABEL: @test_vtstq_u16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b -// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> -// CHECK: ret <8 x i16> [[VTST_I]] +// CHECK-LABEL: define <8 x i16> @test_vtstq_u16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i16> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VTST_I]] +// uint16x8_t test_vtstq_u16(uint16x8_t a, uint16x8_t b) { return vtstq_u16(a, b); } -// CHECK-LABEL: @test_vtstq_u32( -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <4 x i32> %a, %b -// CHECK: [[TMP3:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <4 x i1> [[TMP3]] to <4 x i32> -// CHECK: ret <4 x i32> [[VTST_I]] +// CHECK-LABEL: define <4 x i32> @test_vtstq_u32( +// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i32> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[VTST_I]] +// uint32x4_t test_vtstq_u32(uint32x4_t a, uint32x4_t b) { return vtstq_u32(a, b); } -// CHECK-LABEL: @test_vtstq_p8( -// CHECK: 
[[TMP0:%.*]] = and <16 x i8> %a, %b -// CHECK: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> -// CHECK: ret <16 x i8> [[VTST_I]] +// CHECK-LABEL: define <16 x i8> @test_vtstq_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <16 x i8> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[VTST_I]] +// uint8x16_t test_vtstq_p8(poly8x16_t a, poly8x16_t b) { return vtstq_p8(a, b); } -// CHECK-LABEL: @test_vtstq_p16( -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[TMP2:%.*]] = and <8 x i16> %a, %b -// CHECK: [[TMP3:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer -// CHECK: [[VTST_I:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i16> -// CHECK: ret <8 x i16> [[VTST_I]] +// CHECK-LABEL: define <8 x i16> @test_vtstq_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i16> [[A]], [[B]] +// CHECK-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer +// CHECK-NEXT: [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[VTST_I]] +// uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) { return vtstq_p16(a, b); } -// CHECK: @test_vuzp_s8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> -// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void @test_vuzp_s8( +// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META57:![0-9]+]] +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8 +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: store <8 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META57]] +// CHECK-NEXT: ret void +// int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) { return vuzp_s8(a, b); } -// CHECK: @test_vuzp_s16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope -// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1 -// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> -// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope -// CHECK: ret void +// CHECK-LABEL: define void 
-// CHECK: @test_vuzp_s8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META57:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META57]]
+// CHECK-NEXT: ret void
+//
 int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) { return vuzp_s8(a, b); }
-// CHECK: @test_vuzp_s16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META60:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i16> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META60]]
+// CHECK-NEXT: ret void
+//
 int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) { return vuzp_s16(a, b); }
-// CHECK: @test_vuzp_s32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
-// CHECK: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
-// CHECK: store <2 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32>
+// CHECK-NEXT: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META63:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32>
+// CHECK-NEXT: store <2 x i32> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META63]]
+// CHECK-NEXT: ret void
+//
 int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) { return vuzp_s32(a, b); }
-// CHECK: @test_vuzp_u8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META66:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META66]]
+// CHECK-NEXT: ret void
+//
 uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) { return vuzp_u8(a, b); }
-// CHECK: @test_vuzp_u16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META69:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i16> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META69]]
+// CHECK-NEXT: ret void
+//
 uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) { return vuzp_u16(a, b); }
-// CHECK: @test_vuzp_u32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
-// CHECK: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
-// CHECK: store <2 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32>
+// CHECK-NEXT: store <2 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META72:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32>
+// CHECK-NEXT: store <2 x i32> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META72]]
+// CHECK-NEXT: ret void
+//
 uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) { return vuzp_u32(a, b); }
-// CHECK: @test_vuzp_f32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32>
-// CHECK: store <2 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32>
-// CHECK: store <2 x float> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32>
+// CHECK-NEXT: store <2 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META75:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32>
+// CHECK-NEXT: store <2 x float> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META75]]
+// CHECK-NEXT: ret void
+//
 float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) { return vuzp_f32(a, b); }
-// CHECK: @test_vuzp_p8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META78:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META78]]
+// CHECK-NEXT: ret void
+//
 poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) { return vuzp_p8(a, b); }
-// CHECK: @test_vuzp_p16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzp_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META81:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i16> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META81]]
+// CHECK-NEXT: ret void
+//
 poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) { return vuzp_p16(a, b); }
-// CHECK: @test_vuzpq_s8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META84:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: store <16 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META84]]
+// CHECK-NEXT: ret void
+//
 int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) { return vuzpq_s8(a, b); }
-// CHECK: @test_vuzpq_s16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META87:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i16> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META87]]
+// CHECK-NEXT: ret void
+//
 int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) { return vuzpq_s16(a, b); }
-// CHECK: @test_vuzpq_s32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
-// CHECK: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
-// CHECK: store <4 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META90:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META90]]
+// CHECK-NEXT: ret void
+//
 int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) { return vuzpq_s32(a, b); }
-// CHECK: @test_vuzpq_u8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META93:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: store <16 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META93]]
+// CHECK-NEXT: ret void
+//
 uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) { return vuzpq_u8(a, b); }
-// CHECK: @test_vuzpq_u16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META96:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i16> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META96]]
+// CHECK-NEXT: ret void
+//
 uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) { return vuzpq_u16(a, b); }
-// CHECK: @test_vuzpq_u32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
-// CHECK: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
-// CHECK: store <4 x i32> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META99:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META99]]
+// CHECK-NEXT: ret void
+//
 uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) { return vuzpq_u32(a, b); }
-// CHECK: @test_vuzpq_f32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
-// CHECK: store <4 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
-// CHECK: store <4 x float> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x float> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META102:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x float> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META102]]
+// CHECK-NEXT: ret void
+//
 float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) { return vuzpq_f32(a, b); }
-// CHECK: @test_vuzpq_p8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VUZP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: store <16 x i8> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META105:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: store <16 x i8> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META105]]
+// CHECK-NEXT: ret void
+//
 poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) { return vuzpq_p8(a, b); }
-// CHECK: @test_vuzpq_p16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VUZP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VUZP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vuzpq_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i16> [[VUZP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META108:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i16> [[VUZP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META108]]
+// CHECK-NEXT: ret void
+//
 poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) { return vuzpq_p16(a, b); }
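Each vuzp wrapper above is checked down to a pair of shufflevectors stored through the sret pointer: the first takes the even lanes of the concatenated a:b vector, the second the odd lanes, with the second store landing one vector further into the result struct. A self-contained sketch for the 8 x i8 case; the literal masks are supplied here for illustration only (the check lines above elide them):

define void @vuzp_sketch(<8 x i8> %a, <8 x i8> %b, ptr %dst) {
entry:
  ; even lanes of a:b -> first field of the x2 struct
  %uzp0 = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i8> %uzp0, ptr %dst
  ; odd lanes -> second field, 8 bytes further on
  %second = getelementptr inbounds i8, ptr %dst, i32 8
  %uzp1 = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i8> %uzp1, ptr %second
  ret void
}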
-// CHECK: @test_vzip_s8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META111:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META111]]
+// CHECK-NEXT: ret void
+//
 int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) { return vzip_s8(a, b); }
-// CHECK: @test_vzip_s16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META114:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i16> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META114]]
+// CHECK-NEXT: ret void
+//
 int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) { return vzip_s16(a, b); }
-// CHECK: @test_vzip_s32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
-// CHECK: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
-// CHECK: store <2 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32>
+// CHECK-NEXT: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META117:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32>
+// CHECK-NEXT: store <2 x i32> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META117]]
+// CHECK-NEXT: ret void
+//
 int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) { return vzip_s32(a, b); }
-// CHECK: @test_vzip_u8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META120:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META120]]
+// CHECK-NEXT: ret void
+//
 uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) { return vzip_u8(a, b); }
-// CHECK: @test_vzip_u16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META123:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i16> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META123]]
+// CHECK-NEXT: ret void
+//
 uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) { return vzip_u16(a, b); }
-// CHECK: @test_vzip_u32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
-// CHECK: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32>
-// CHECK: store <2 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32>
+// CHECK-NEXT: store <2 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META126:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32>
+// CHECK-NEXT: store <2 x i32> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META126]]
+// CHECK-NEXT: ret void
+//
 uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) { return vzip_u32(a, b); }
-// CHECK: @test_vzip_f32({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32>
-// CHECK: store <2 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <2 x float>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32>
-// CHECK: store <2 x float> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X2X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32>
+// CHECK-NEXT: store <2 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META129:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32>
+// CHECK-NEXT: store <2 x float> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META129]]
+// CHECK-NEXT: ret void
+//
 float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) { return vzip_f32(a, b); }
-// CHECK: @test_vzip_p8({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32>
-// CHECK: store <8 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X8X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META132:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META132]]
+// CHECK-NEXT: ret void
+//
 poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) { return vzip_p8(a, b); }
-// CHECK: @test_vzip_p16({{.*}} sret({{.*}}) align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32>
-// CHECK: store <4 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzip_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X4X2_T:%.*]]) align 8 [[AGG_RESULT:%.*]], <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META135:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 8
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i16> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META135]]
+// CHECK-NEXT: ret void
+//
 poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) { return vzip_p16(a, b); }
-// CHECK: @test_vzipq_s8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_s8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META138:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: store <16 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META138]]
+// CHECK-NEXT: ret void
+//
 int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) { return vzipq_s8(a, b); }
-// CHECK: @test_vzipq_s16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_s16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META141:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i16> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META141]]
+// CHECK-NEXT: ret void
+//
 int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) { return vzipq_s16(a, b); }
-// CHECK: @test_vzipq_s32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
-// CHECK: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
-// CHECK: store <4 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_s32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_INT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META144:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META144]]
+// CHECK-NEXT: ret void
+//
 int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) { return vzipq_s32(a, b); }
-// CHECK: @test_vzipq_u8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_u8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META147:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: store <16 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META147]]
+// CHECK-NEXT: ret void
+//
 uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) { return vzipq_u8(a, b); }
-// CHECK: @test_vzipq_u16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_u16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META150:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i16> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META150]]
+// CHECK-NEXT: ret void
+//
 uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) { return vzipq_u16(a, b); }
-// CHECK: @test_vzipq_u32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
-// CHECK: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32>
-// CHECK: store <4 x i32> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_u32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_UINT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META153:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x i32> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META153]]
+// CHECK-NEXT: ret void
+//
 uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) { return vzipq_u32(a, b); }
-// CHECK: @test_vzipq_f32({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
-// CHECK: store <4 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <4 x float>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32>
-// CHECK: store <4 x float> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_f32(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_FLOAT32X4X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x float> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META156:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32>
+// CHECK-NEXT: store <4 x float> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META156]]
+// CHECK-NEXT: ret void
+//
 float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) { return vzipq_f32(a, b); }
-// CHECK: @test_vzipq_p8({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32>
-// CHECK: store <16 x i8> [[VZIP1_I]], ptr [[TMP2]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_p8(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY8X16X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: store <16 x i8> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META159:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32>
+// CHECK-NEXT: store <16 x i8> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META159]]
+// CHECK-NEXT: ret void
+//
 poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) { return vzipq_p8(a, b); }
-// CHECK: @test_vzipq_p16({{.*}} sret({{.*}}) align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]],
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope
-// CHECK: [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[AGG_RESULT]], i32 1
-// CHECK: [[VZIP1_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32>
-// CHECK: store <8 x i16> [[VZIP1_I]], ptr [[TMP4]], align 4, !alias.scope
-// CHECK: ret void
+// CHECK-LABEL: define void @test_vzipq_p16(
+// CHECK-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_POLY16X8X2_T:%.*]]) align 16 [[AGG_RESULT:%.*]], <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i16> [[VZIP_I]], ptr [[AGG_RESULT]], align 4, !alias.scope [[META162:![0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[AGG_RESULT]], i32 16
+// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32>
+// CHECK-NEXT: store <8 x i16> [[VZIP1_I]], ptr [[TMP0]], align 4, !alias.scope [[META162]]
+// CHECK-NEXT: ret void
+//
 poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) { return vzipq_p16(a, b); }
+//.
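The vzip wrappers interleave rather than deinterleave, but they compile to the same two-shuffle, two-store shape. What varies across all of these functions is only the scope metadata on the stores, which the block below pins down: each wrapper gets one distinct domain and one scope inside it, so the noalias facts about its sret temporary cannot leak between wrappers. The shape being matched, with illustrative numbering:

; one wrapper's scope list, scope, and domain (numbers are illustrative)
!0 = !{!1}
!1 = distinct !{!1, !2, !"vuzp_s8: %agg.result"}
!2 = distinct !{!2, !"vuzp_s8"}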
+// CHECK: [[META3]] = !{[[META4:![0-9]+]]} +// CHECK: [[META4]] = distinct !{[[META4]], [[META5:![0-9]+]], !"vtrn_s8: %agg.result"} +// CHECK: [[META5]] = distinct !{[[META5]], !"vtrn_s8"} +// CHECK: [[META6]] = !{[[META7:![0-9]+]]} +// CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]], !"vtrn_s16: %agg.result"} +// CHECK: [[META8]] = distinct !{[[META8]], !"vtrn_s16"} +// CHECK: [[META9]] = !{[[META10:![0-9]+]]} +// CHECK: [[META10]] = distinct !{[[META10]], [[META11:![0-9]+]], !"vtrn_s32: %agg.result"} +// CHECK: [[META11]] = distinct !{[[META11]], !"vtrn_s32"} +// CHECK: [[META12]] = !{[[META13:![0-9]+]]} +// CHECK: [[META13]] = distinct !{[[META13]], [[META14:![0-9]+]], !"vtrn_u8: %agg.result"} +// CHECK: [[META14]] = distinct !{[[META14]], !"vtrn_u8"} +// CHECK: [[META15]] = !{[[META16:![0-9]+]]} +// CHECK: [[META16]] = distinct !{[[META16]], [[META17:![0-9]+]], !"vtrn_u16: %agg.result"} +// CHECK: [[META17]] = distinct !{[[META17]], !"vtrn_u16"} +// CHECK: [[META18]] = !{[[META19:![0-9]+]]} +// CHECK: [[META19]] = distinct !{[[META19]], [[META20:![0-9]+]], !"vtrn_u32: %agg.result"} +// CHECK: [[META20]] = distinct !{[[META20]], !"vtrn_u32"} +// CHECK: [[META21]] = !{[[META22:![0-9]+]]} +// CHECK: [[META22]] = distinct !{[[META22]], [[META23:![0-9]+]], !"vtrn_f32: %agg.result"} +// CHECK: [[META23]] = distinct !{[[META23]], !"vtrn_f32"} +// CHECK: [[META24]] = !{[[META25:![0-9]+]]} +// CHECK: [[META25]] = distinct !{[[META25]], [[META26:![0-9]+]], !"vtrn_p8: %agg.result"} +// CHECK: [[META26]] = distinct !{[[META26]], !"vtrn_p8"} +// CHECK: [[META27]] = !{[[META28:![0-9]+]]} +// CHECK: [[META28]] = distinct !{[[META28]], [[META29:![0-9]+]], !"vtrn_p16: %agg.result"} +// CHECK: [[META29]] = distinct !{[[META29]], !"vtrn_p16"} +// CHECK: [[META30]] = !{[[META31:![0-9]+]]} +// CHECK: [[META31]] = distinct !{[[META31]], [[META32:![0-9]+]], !"vtrnq_s8: %agg.result"} +// CHECK: [[META32]] = distinct !{[[META32]], !"vtrnq_s8"} +// CHECK: [[META33]] = !{[[META34:![0-9]+]]} +// CHECK: [[META34]] = distinct !{[[META34]], [[META35:![0-9]+]], !"vtrnq_s16: %agg.result"} +// CHECK: [[META35]] = distinct !{[[META35]], !"vtrnq_s16"} +// CHECK: [[META36]] = !{[[META37:![0-9]+]]} +// CHECK: [[META37]] = distinct !{[[META37]], [[META38:![0-9]+]], !"vtrnq_s32: %agg.result"} +// CHECK: [[META38]] = distinct !{[[META38]], !"vtrnq_s32"} +// CHECK: [[META39]] = !{[[META40:![0-9]+]]} +// CHECK: [[META40]] = distinct !{[[META40]], [[META41:![0-9]+]], !"vtrnq_u8: %agg.result"} +// CHECK: [[META41]] = distinct !{[[META41]], !"vtrnq_u8"} +// CHECK: [[META42]] = !{[[META43:![0-9]+]]} +// CHECK: [[META43]] = distinct !{[[META43]], [[META44:![0-9]+]], !"vtrnq_u16: %agg.result"} +// CHECK: [[META44]] = distinct !{[[META44]], !"vtrnq_u16"} +// CHECK: [[META45]] = !{[[META46:![0-9]+]]} +// CHECK: [[META46]] = distinct !{[[META46]], [[META47:![0-9]+]], !"vtrnq_u32: %agg.result"} +// CHECK: [[META47]] = distinct !{[[META47]], !"vtrnq_u32"} +// CHECK: [[META48]] = !{[[META49:![0-9]+]]} +// CHECK: [[META49]] = distinct !{[[META49]], [[META50:![0-9]+]], !"vtrnq_f32: %agg.result"} +// CHECK: [[META50]] = distinct !{[[META50]], !"vtrnq_f32"} +// CHECK: [[META51]] = !{[[META52:![0-9]+]]} +// CHECK: [[META52]] = distinct !{[[META52]], [[META53:![0-9]+]], !"vtrnq_p8: %agg.result"} +// CHECK: [[META53]] = distinct !{[[META53]], !"vtrnq_p8"} +// CHECK: [[META54]] = !{[[META55:![0-9]+]]} +// CHECK: [[META55]] = distinct !{[[META55]], [[META56:![0-9]+]], !"vtrnq_p16: %agg.result"} +// CHECK: [[META56]] = distinct 
!{[[META56]], !"vtrnq_p16"} +// CHECK: [[META57]] = !{[[META58:![0-9]+]]} +// CHECK: [[META58]] = distinct !{[[META58]], [[META59:![0-9]+]], !"vuzp_s8: %agg.result"} +// CHECK: [[META59]] = distinct !{[[META59]], !"vuzp_s8"} +// CHECK: [[META60]] = !{[[META61:![0-9]+]]} +// CHECK: [[META61]] = distinct !{[[META61]], [[META62:![0-9]+]], !"vuzp_s16: %agg.result"} +// CHECK: [[META62]] = distinct !{[[META62]], !"vuzp_s16"} +// CHECK: [[META63]] = !{[[META64:![0-9]+]]} +// CHECK: [[META64]] = distinct !{[[META64]], [[META65:![0-9]+]], !"vuzp_s32: %agg.result"} +// CHECK: [[META65]] = distinct !{[[META65]], !"vuzp_s32"} +// CHECK: [[META66]] = !{[[META67:![0-9]+]]} +// CHECK: [[META67]] = distinct !{[[META67]], [[META68:![0-9]+]], !"vuzp_u8: %agg.result"} +// CHECK: [[META68]] = distinct !{[[META68]], !"vuzp_u8"} +// CHECK: [[META69]] = !{[[META70:![0-9]+]]} +// CHECK: [[META70]] = distinct !{[[META70]], [[META71:![0-9]+]], !"vuzp_u16: %agg.result"} +// CHECK: [[META71]] = distinct !{[[META71]], !"vuzp_u16"} +// CHECK: [[META72]] = !{[[META73:![0-9]+]]} +// CHECK: [[META73]] = distinct !{[[META73]], [[META74:![0-9]+]], !"vuzp_u32: %agg.result"} +// CHECK: [[META74]] = distinct !{[[META74]], !"vuzp_u32"} +// CHECK: [[META75]] = !{[[META76:![0-9]+]]} +// CHECK: [[META76]] = distinct !{[[META76]], [[META77:![0-9]+]], !"vuzp_f32: %agg.result"} +// CHECK: [[META77]] = distinct !{[[META77]], !"vuzp_f32"} +// CHECK: [[META78]] = !{[[META79:![0-9]+]]} +// CHECK: [[META79]] = distinct !{[[META79]], [[META80:![0-9]+]], !"vuzp_p8: %agg.result"} +// CHECK: [[META80]] = distinct !{[[META80]], !"vuzp_p8"} +// CHECK: [[META81]] = !{[[META82:![0-9]+]]} +// CHECK: [[META82]] = distinct !{[[META82]], [[META83:![0-9]+]], !"vuzp_p16: %agg.result"} +// CHECK: [[META83]] = distinct !{[[META83]], !"vuzp_p16"} +// CHECK: [[META84]] = !{[[META85:![0-9]+]]} +// CHECK: [[META85]] = distinct !{[[META85]], [[META86:![0-9]+]], !"vuzpq_s8: %agg.result"} +// CHECK: [[META86]] = distinct !{[[META86]], !"vuzpq_s8"} +// CHECK: [[META87]] = !{[[META88:![0-9]+]]} +// CHECK: [[META88]] = distinct !{[[META88]], [[META89:![0-9]+]], !"vuzpq_s16: %agg.result"} +// CHECK: [[META89]] = distinct !{[[META89]], !"vuzpq_s16"} +// CHECK: [[META90]] = !{[[META91:![0-9]+]]} +// CHECK: [[META91]] = distinct !{[[META91]], [[META92:![0-9]+]], !"vuzpq_s32: %agg.result"} +// CHECK: [[META92]] = distinct !{[[META92]], !"vuzpq_s32"} +// CHECK: [[META93]] = !{[[META94:![0-9]+]]} +// CHECK: [[META94]] = distinct !{[[META94]], [[META95:![0-9]+]], !"vuzpq_u8: %agg.result"} +// CHECK: [[META95]] = distinct !{[[META95]], !"vuzpq_u8"} +// CHECK: [[META96]] = !{[[META97:![0-9]+]]} +// CHECK: [[META97]] = distinct !{[[META97]], [[META98:![0-9]+]], !"vuzpq_u16: %agg.result"} +// CHECK: [[META98]] = distinct !{[[META98]], !"vuzpq_u16"} +// CHECK: [[META99]] = !{[[META100:![0-9]+]]} +// CHECK: [[META100]] = distinct !{[[META100]], [[META101:![0-9]+]], !"vuzpq_u32: %agg.result"} +// CHECK: [[META101]] = distinct !{[[META101]], !"vuzpq_u32"} +// CHECK: [[META102]] = !{[[META103:![0-9]+]]} +// CHECK: [[META103]] = distinct !{[[META103]], [[META104:![0-9]+]], !"vuzpq_f32: %agg.result"} +// CHECK: [[META104]] = distinct !{[[META104]], !"vuzpq_f32"} +// CHECK: [[META105]] = !{[[META106:![0-9]+]]} +// CHECK: [[META106]] = distinct !{[[META106]], [[META107:![0-9]+]], !"vuzpq_p8: %agg.result"} +// CHECK: [[META107]] = distinct !{[[META107]], !"vuzpq_p8"} +// CHECK: [[META108]] = !{[[META109:![0-9]+]]} +// CHECK: [[META109]] = distinct !{[[META109]], [[META110:![0-9]+]], 
!"vuzpq_p16: %agg.result"} +// CHECK: [[META110]] = distinct !{[[META110]], !"vuzpq_p16"} +// CHECK: [[META111]] = !{[[META112:![0-9]+]]} +// CHECK: [[META112]] = distinct !{[[META112]], [[META113:![0-9]+]], !"vzip_s8: %agg.result"} +// CHECK: [[META113]] = distinct !{[[META113]], !"vzip_s8"} +// CHECK: [[META114]] = !{[[META115:![0-9]+]]} +// CHECK: [[META115]] = distinct !{[[META115]], [[META116:![0-9]+]], !"vzip_s16: %agg.result"} +// CHECK: [[META116]] = distinct !{[[META116]], !"vzip_s16"} +// CHECK: [[META117]] = !{[[META118:![0-9]+]]} +// CHECK: [[META118]] = distinct !{[[META118]], [[META119:![0-9]+]], !"vzip_s32: %agg.result"} +// CHECK: [[META119]] = distinct !{[[META119]], !"vzip_s32"} +// CHECK: [[META120]] = !{[[META121:![0-9]+]]} +// CHECK: [[META121]] = distinct !{[[META121]], [[META122:![0-9]+]], !"vzip_u8: %agg.result"} +// CHECK: [[META122]] = distinct !{[[META122]], !"vzip_u8"} +// CHECK: [[META123]] = !{[[META124:![0-9]+]]} +// CHECK: [[META124]] = distinct !{[[META124]], [[META125:![0-9]+]], !"vzip_u16: %agg.result"} +// CHECK: [[META125]] = distinct !{[[META125]], !"vzip_u16"} +// CHECK: [[META126]] = !{[[META127:![0-9]+]]} +// CHECK: [[META127]] = distinct !{[[META127]], [[META128:![0-9]+]], !"vzip_u32: %agg.result"} +// CHECK: [[META128]] = distinct !{[[META128]], !"vzip_u32"} +// CHECK: [[META129]] = !{[[META130:![0-9]+]]} +// CHECK: [[META130]] = distinct !{[[META130]], [[META131:![0-9]+]], !"vzip_f32: %agg.result"} +// CHECK: [[META131]] = distinct !{[[META131]], !"vzip_f32"} +// CHECK: [[META132]] = !{[[META133:![0-9]+]]} +// CHECK: [[META133]] = distinct !{[[META133]], [[META134:![0-9]+]], !"vzip_p8: %agg.result"} +// CHECK: [[META134]] = distinct !{[[META134]], !"vzip_p8"} +// CHECK: [[META135]] = !{[[META136:![0-9]+]]} +// CHECK: [[META136]] = distinct !{[[META136]], [[META137:![0-9]+]], !"vzip_p16: %agg.result"} +// CHECK: [[META137]] = distinct !{[[META137]], !"vzip_p16"} +// CHECK: [[META138]] = !{[[META139:![0-9]+]]} +// CHECK: [[META139]] = distinct !{[[META139]], [[META140:![0-9]+]], !"vzipq_s8: %agg.result"} +// CHECK: [[META140]] = distinct !{[[META140]], !"vzipq_s8"} +// CHECK: [[META141]] = !{[[META142:![0-9]+]]} +// CHECK: [[META142]] = distinct !{[[META142]], [[META143:![0-9]+]], !"vzipq_s16: %agg.result"} +// CHECK: [[META143]] = distinct !{[[META143]], !"vzipq_s16"} +// CHECK: [[META144]] = !{[[META145:![0-9]+]]} +// CHECK: [[META145]] = distinct !{[[META145]], [[META146:![0-9]+]], !"vzipq_s32: %agg.result"} +// CHECK: [[META146]] = distinct !{[[META146]], !"vzipq_s32"} +// CHECK: [[META147]] = !{[[META148:![0-9]+]]} +// CHECK: [[META148]] = distinct !{[[META148]], [[META149:![0-9]+]], !"vzipq_u8: %agg.result"} +// CHECK: [[META149]] = distinct !{[[META149]], !"vzipq_u8"} +// CHECK: [[META150]] = !{[[META151:![0-9]+]]} +// CHECK: [[META151]] = distinct !{[[META151]], [[META152:![0-9]+]], !"vzipq_u16: %agg.result"} +// CHECK: [[META152]] = distinct !{[[META152]], !"vzipq_u16"} +// CHECK: [[META153]] = !{[[META154:![0-9]+]]} +// CHECK: [[META154]] = distinct !{[[META154]], [[META155:![0-9]+]], !"vzipq_u32: %agg.result"} +// CHECK: [[META155]] = distinct !{[[META155]], !"vzipq_u32"} +// CHECK: [[META156]] = !{[[META157:![0-9]+]]} +// CHECK: [[META157]] = distinct !{[[META157]], [[META158:![0-9]+]], !"vzipq_f32: %agg.result"} +// CHECK: [[META158]] = distinct !{[[META158]], !"vzipq_f32"} +// CHECK: [[META159]] = !{[[META160:![0-9]+]]} +// CHECK: [[META160]] = distinct !{[[META160]], [[META161:![0-9]+]], !"vzipq_p8: %agg.result"} +// CHECK: 
+// CHECK: [[META162]] = !{[[META163:![0-9]+]]}
+// CHECK: [[META163]] = distinct !{[[META163]], [[META164:![0-9]+]], !"vzipq_p16: %agg.result"}
+// CHECK: [[META164]] = distinct !{[[META164]], !"vzipq_p16"}
+//.
diff --git a/llvm/test/CodeGen/AArch64/neon-misc-constrained.ll b/llvm/test/CodeGen/AArch64/neon-misc-constrained.ll
new file mode 100644
index 0000000000000..26642b5ae8208
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-misc-constrained.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+target triple = "arm64-none-linux-gnu"
+
+define <2 x double> @test_vrndaq_f64(<2 x double> %a) #0 {
+; CHECK-LABEL: test_vrndaq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: frinta v0.2d, v0.2d
+; CHECK-NEXT: ret
+entry:
+  %vrnda1.i = call <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double> %a, metadata !"fpexcept.strict") #1
+  ret <2 x double> %vrnda1.i
+}
+
+define <2 x double> @test_vrndpq_f64(<2 x double> %a) #0 {
+; CHECK-LABEL: test_vrndpq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: frintp v0.2d, v0.2d
+; CHECK-NEXT: ret
+entry:
+  %vrndp1.i = call <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double> %a, metadata !"fpexcept.strict") #1
+  ret <2 x double> %vrndp1.i
+}
+
+define <4 x float> @test_vsqrtq_f32(<4 x float> %a) #0 {
+; CHECK-LABEL: test_vsqrtq_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.4s, v0.4s
+; CHECK-NEXT: ret
+entry:
+  %vsqrt.i = call <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <4 x float> %vsqrt.i
+}
+
+define <2 x double> @test_vsqrtq_f64(<2 x double> %a) #0 {
+; CHECK-LABEL: test_vsqrtq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.2d, v0.2d
+; CHECK-NEXT: ret
+entry:
+  %vsqrt.i = call <2 x double> @llvm.experimental.constrained.sqrt.v2f64(<2 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <2 x double> %vsqrt.i
+}
+
+attributes #0 = { noinline nounwind strictfp "target-features"="+neon" }
+attributes #1 = { strictfp }
diff --git a/llvm/test/CodeGen/AArch64/neon-misc-unconstrained.ll b/llvm/test/CodeGen/AArch64/neon-misc-unconstrained.ll
new file mode 100644
index 0000000000000..ae7d09d42c02f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-misc-unconstrained.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+target triple = "arm64-none-linux-gnu"
+
+define <2 x double> @test_vrndaq_f64(<2 x double> %a) #0 {
+; CHECK-LABEL: test_vrndaq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: frinta v0.2d, v0.2d
+; CHECK-NEXT: ret
+entry:
+  %vrnda1.i = call <2 x double> @llvm.round.v2f64(<2 x double> %a)
+  ret <2 x double> %vrnda1.i
+}
+
+define <2 x double> @test_vrndpq_f64(<2 x double> %a) #0 {
+; CHECK-LABEL: test_vrndpq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: frintp v0.2d, v0.2d
+; CHECK-NEXT: ret
+entry:
+  %vrndp1.i = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a)
+  ret <2 x double> %vrndp1.i
+}
+
+define <4 x float> @test_vsqrtq_f32(<4 x float> %a) #0 {
+; CHECK-LABEL: test_vsqrtq_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.4s, v0.4s
+; CHECK-NEXT: ret
+entry:
+  %vsqrt.i = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a)
+  ret <4 x float> %vsqrt.i
+}
+
+define <2 x double> @test_vsqrtq_f64(<2 x double> %a) #0 {
+; CHECK-LABEL: test_vsqrtq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.2d, v0.2d
+; CHECK-NEXT: ret
+entry:
+  %vsqrt.i = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a)
+  ret <2 x double> %vsqrt.i
+}
+
+attributes #0 = { noinline nounwind "no-trapping-math"="true" "target-features"="+neon" }
diff --git a/llvm/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.ll b/llvm/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.ll
new file mode 100644
index 0000000000000..b683682901e8f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-constrained.ll
@@ -0,0 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+target triple = "arm64-none-linux-gnu"
+
+define float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %c) #0 {
+; CHECK-LABEL: test_vfmas_lane_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla s0, s1, v2.s[1]
+; CHECK-NEXT: ret
+entry:
+  %extract = extractelement <2 x float> %c, i64 1
+  %0 = call float @llvm.experimental.constrained.fma.f32(float %b, float %extract, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret float %0
+}
+
+define double @test_vfmad_lane_f64(double %a, double %b, <1 x double> %c) #0 {
+; CHECK-LABEL: test_vfmad_lane_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmadd d0, d1, d2, d0
+; CHECK-NEXT: ret
+entry:
+  %extract = extractelement <1 x double> %c, i64 0
+  %0 = call double @llvm.experimental.constrained.fma.f64(double %b, double %extract, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret double %0
+}
+
+define double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %c) #0 {
+; CHECK-LABEL: test_vfmad_laneq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla d0, d1, v2.d[1]
+; CHECK-NEXT: ret
+entry:
+  %extract = extractelement <2 x double> %c, i64 1
+  %0 = call double @llvm.experimental.constrained.fma.f64(double %b, double %extract, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret double %0
+}
+
+define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %c) #0 {
+; CHECK-LABEL: test_vfmss_lane_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmls s0, s1, v2.s[1]
+; CHECK-NEXT: ret
+entry:
+  %fneg = fneg float %b
+  %extract = extractelement <2 x float> %c, i64 1
+  %0 = call float @llvm.experimental.constrained.fma.f32(float %fneg, float %extract, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret float %0
+}
+
+define <1 x double> @test_vfma_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
+; CHECK-LABEL: test_vfma_lane_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmadd d0, d1, d2, d0
+; CHECK-NEXT: ret
+entry:
+  %fmla2 = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> %b, <1 x double> %v, <1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <1 x double> %fmla2
+}
+
+define <1 x double> @test_vfms_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
+; CHECK-LABEL: test_vfms_lane_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub d0, d2, d1, d0
+; CHECK-NEXT: ret
+entry:
+  %fneg = fneg <1 x double> %b
+  %fmla2 = call <1 x double> @llvm.experimental.constrained.fma.v1f64(<1 x double> %fneg, <1 x double> %v, <1 x double> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
!"fpexcept.strict") #1 + ret <1 x double> %fmla2 +} + +define <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 { +; CHECK-LABEL: test_vfma_laneq_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmadd d0, d1, d2, d0 +; CHECK-NEXT: ret +entry: + %0 = extractelement <1 x double> %a, i64 0 + %1 = extractelement <1 x double> %b, i64 0 + %extract = extractelement <2 x double> %v, i64 0 + %2 = call double @llvm.experimental.constrained.fma.f64(double %1, double %extract, double %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + %3 = bitcast double %2 to <1 x double> + ret <1 x double> %3 +} + +define <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 { +; CHECK-LABEL: test_vfms_laneq_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmsub d0, d2, d1, d0 +; CHECK-NEXT: ret +entry: + %0 = extractelement <1 x double> %a, i64 0 + %1 = extractelement <1 x double> %b, i64 0 + %2 = fneg double %1 + %extract = extractelement <2 x double> %v, i64 0 + %3 = call double @llvm.experimental.constrained.fma.f64(double %2, double %extract, double %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + %4 = bitcast double %3 to <1 x double> + ret <1 x double> %4 +} + +attributes #0 = { noinline nounwind strictfp "target-cpu"="cyclone" "target-features"="+neon" } +attributes #1 = { strictfp } diff --git a/llvm/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-unconstrained.ll b/llvm/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-unconstrained.ll new file mode 100644 index 0000000000000..eb3f572ef7072 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/neon-scalar-x-indexed-elem-unconstrained.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s | FileCheck %s +target triple = "arm64-none-linux-gnu" + +define float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %c) #0 { +; CHECK-LABEL: test_vfmas_lane_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmla s0, s1, v2.s[1] +; CHECK-NEXT: ret +entry: + %extract = extractelement <2 x float> %c, i64 1 + %0 = call float @llvm.fma.f32(float %b, float %extract, float %a) + ret float %0 +} + +define double @test_vfmad_lane_f64(double %a, double %b, <1 x double> %c) #0 { +; CHECK-LABEL: test_vfmad_lane_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmadd d0, d1, d2, d0 +; CHECK-NEXT: ret +entry: + %extract = extractelement <1 x double> %c, i64 0 + %0 = call double @llvm.fma.f64(double %b, double %extract, double %a) + ret double %0 +} + +define double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %c) #0 { +; CHECK-LABEL: test_vfmad_laneq_f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmla d0, d1, v2.d[1] +; CHECK-NEXT: ret +entry: + %extract = extractelement <2 x double> %c, i64 1 + %0 = call double @llvm.fma.f64(double %b, double %extract, double %a) + ret double %0 +} + +define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %c) #0 { +; CHECK-LABEL: test_vfmss_lane_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: fmls s0, s1, v2.s[1] +; CHECK-NEXT: ret +entry: + %fneg = fneg float %b + %extract = extractelement <2 x float> %c, i64 1 + %0 = call float @llvm.fma.f32(float %fneg, float %extract, float %a) + ret float %0 +} + +define <1 x double> @test_vfma_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 { +; CHECK-LABEL: test_vfma_lane_f64: +; 
+; CHECK-NEXT: fmadd d0, d1, d2, d0
+; CHECK-NEXT: ret
+entry:
+  %fmla2 = call <1 x double> @llvm.fma.v1f64(<1 x double> %b, <1 x double> %v, <1 x double> %a)
+  ret <1 x double> %fmla2
+}
+
+define <1 x double> @test_vfms_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
+; CHECK-LABEL: test_vfms_lane_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub d0, d1, d2, d0
+; CHECK-NEXT: ret
+entry:
+  %fneg = fneg <1 x double> %b
+  %fmla2 = call <1 x double> @llvm.fma.v1f64(<1 x double> %fneg, <1 x double> %v, <1 x double> %a)
+  ret <1 x double> %fmla2
+}
+
+define <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
+; CHECK-LABEL: test_vfma_laneq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmadd d0, d1, d2, d0
+; CHECK-NEXT: ret
+entry:
+  %0 = extractelement <1 x double> %a, i64 0
+  %1 = extractelement <1 x double> %b, i64 0
+  %extract = extractelement <2 x double> %v, i64 0
+  %2 = call double @llvm.fma.f64(double %1, double %extract, double %0)
+  %3 = bitcast double %2 to <1 x double>
+  ret <1 x double> %3
+}
+
+define <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
+; CHECK-LABEL: test_vfms_laneq_f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmsub d0, d2, d1, d0
+; CHECK-NEXT: ret
+entry:
+  %0 = extractelement <1 x double> %a, i64 0
+  %1 = extractelement <1 x double> %b, i64 0
+  %2 = fneg double %1
+  %extract = extractelement <2 x double> %v, i64 0
+  %3 = call double @llvm.fma.f64(double %2, double %extract, double %0)
+  %4 = bitcast double %3 to <1 x double>
+  ret <1 x double> %4
+}
+
+attributes #0 = { noinline nounwind "no-trapping-math"="true" "target-cpu"="cyclone" "target-features"="+neon" }
+
diff --git a/llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.ll b/llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.ll
new file mode 100644
index 0000000000000..58f1accdf91d5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.ll
@@ -0,0 +1,276 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+target triple = "arm64-none-linux-gnu"
+
+define <4 x half> @test_vsqrt_f16(<4 x half> %a) #0 {
+; CHECK-LABEL: test_vsqrt_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+  %vsqrt.i = call <4 x half> @llvm.experimental.constrained.sqrt.v4f16(<4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <4 x half> %vsqrt.i
+}
+
+define <8 x half> @test_vsqrtq_f16(<8 x half> %a) #0 {
+; CHECK-LABEL: test_vsqrtq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.8h, v0.8h
+; CHECK-NEXT: ret
+entry:
+  %vsqrt.i = call <8 x half> @llvm.experimental.constrained.sqrt.v8f16(<8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <8 x half> %vsqrt.i
+}
+
+define <4 x half> @test_vfma_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfma_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h
+; CHECK-NEXT: ret
+entry:
+  %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmaq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmaq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: ret
+entry:
+  %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <8 x half> %0
+}
+
+define <4 x half> @test_vfms_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfms_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v0.4h, v2.4h, v1.4h
+; CHECK-NEXT: ret
+entry:
+  %fneg.i = fneg <4 x half> %b
+  %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %fneg.i, <4 x half> %c, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmsq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: ret
+entry:
+  %fneg.i = fneg <8 x half> %b
+  %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %fneg.i, <8 x half> %c, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <8 x half> %0
+}
+
+define <4 x half> @test_vfma_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfma_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+  %lane = shufflevector <4 x half> %c, <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %fmla2 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> %lane, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <4 x half> %fmla2
+}
+
+define <8 x half> @test_vfmaq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfmaq_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+  %lane = shufflevector <4 x half> %c, <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %fmla2 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> %lane, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <8 x half> %fmla2
+}
+
+define <4 x half> @test_vfma_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfma_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+  %lane = shufflevector <8 x half> %c, <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %lane, <4 x half> %b, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmaq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmaq_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+  %lane = shufflevector <8 x half> %c, <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %lane, <8 x half> %b, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <8 x half> %0
+}
+
+define <4 x half> @test_vfma_n_f16(<4 x half> %a, <4 x half> %b, half %c) #0 {
+; CHECK-LABEL: test_vfma_n_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+  %vecinit = insertelement <4 x half> poison, half %c, i64 0
+  %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> poison, <4 x i32> zeroinitializer
+  %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %b, <4 x half> %vecinit3, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, half %c) #0 {
+; CHECK-LABEL: test_vfmaq_n_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+  %vecinit = insertelement <8 x half> poison, half %c, i64 0
+  %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> poison, <8 x i32> zeroinitializer
+  %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %b, <8 x half> %vecinit7, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <8 x half> %0
+}
+
+define half @test_vfmah_lane_f16(half %a, half %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfmah_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla h0, h1, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+  %extract = extractelement <4 x half> %c, i64 3
+  %0 = call half @llvm.experimental.constrained.fma.f16(half %b, half %extract, half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret half %0
+}
+
+define half @test_vfmah_laneq_f16(half %a, half %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmah_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla h0, h1, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+  %extract = extractelement <8 x half> %c, i64 7
+  %0 = call half @llvm.experimental.constrained.fma.f16(half %b, half %extract, half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret half %0
+}
+
+define <4 x half> @test_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfms_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+  %fneg = fneg <4 x half> %b
+  %lane = shufflevector <4 x half> %c, <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %fmla2 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %fneg, <4 x half> %lane, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <4 x half> %fmla2
+}
+
+define <8 x half> @test_vfmsq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsq_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+  %fneg = fneg <8 x half> %b
+  %lane = shufflevector <4 x half> %c, <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %fmla2 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %fneg, <8 x half> %lane, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <8 x half> %fmla2
+}
+
+define <4 x half> @test_vfms_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfms_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+  %fneg = fneg <4 x half> %b
+  %lane = shufflevector <8 x half> %c, <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %lane, <4 x half> %fneg, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmsq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsq_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+  %fneg = fneg <8 x half> %b
+  %lane = shufflevector <8 x half> %c, <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %lane, <8 x half> %fneg, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <8 x half> %0
+}
+
+define <4 x half> @test_vfms_n_f16(<4 x half> %a, <4 x half> %b, half %c) #0 {
+; CHECK-LABEL: test_vfms_n_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+  %fneg = fneg <4 x half> %b
+  %vecinit = insertelement <4 x half> poison, half %c, i64 0
+  %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> poison, <4 x i32> zeroinitializer
+  %0 = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %fneg, <4 x half> %vecinit3, <4 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmsq_n_f16(<8 x half> %a, <8 x half> %b, half %c) #0 {
+; CHECK-LABEL: test_vfmsq_n_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+  %fneg = fneg <8 x half> %b
+  %vecinit = insertelement <8 x half> poison, half %c, i64 0
+  %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> poison, <8 x i32> zeroinitializer
+  %0 = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %fneg, <8 x half> %vecinit7, <8 x half> %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret <8 x half> %0
+}
+
+define half @test_vfmsh_lane_f16(half %a, half %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsh_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fneg s1, s1
+; CHECK-NEXT: fcvt h1, s1
+; CHECK-NEXT: fmla h0, h1, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+  %conv = call float @llvm.experimental.constrained.fpext.f32.f16(half %b, metadata !"fpexcept.strict") #1
+  %fneg = fneg float %conv
+  %0 = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %fneg, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  %extract = extractelement <4 x half> %c, i64 3
+  %1 = call half @llvm.experimental.constrained.fma.f16(half %0, half %extract, half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret half %1
+}
+
+define half @test_vfmsh_laneq_f16(half %a, half %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsh_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvt s1, h1
+; CHECK-NEXT: fneg s1, s1
+; CHECK-NEXT: fcvt h1, s1
+; CHECK-NEXT: fmla h0, h1, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+  %conv = call float @llvm.experimental.constrained.fpext.f32.f16(half %b, metadata !"fpexcept.strict") #1
+  %fneg = fneg float %conv
+  %0 = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %fneg, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  %extract = extractelement <8 x half> %c, i64 7
+  %1 = call half @llvm.experimental.constrained.fma.f16(half %0, half %extract, half %a, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret half %1
+}
+
+attributes #0 = { noinline nounwind strictfp "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+v8.1a,+v8.2a,+v8a" }
+attributes #1 = { strictfp }
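The constrained file above has a deliberate twin in the next hunk: the same functions written against the ordinary (non-strictfp) intrinsics, so the two files should check identical instruction sequences. A minimal sketch, under assumptions, of keeping such a pair in sync by regenerating both with update_llc_test_checks.py; the build path and checkout layout here are illustrative, not part of this patch:

import subprocess

# Hypothetical layout: source checkout at ./llvm, build tree at ./build.
TESTS = [
    "llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-constrained.ll",
    "llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-unconstrained.ll",
]
for test in TESTS:
    # The script rewrites the autogenerated CHECK lines in place; the
    # "NOTE: ... UTC_ARGS: --version 5" header records the arguments used.
    subprocess.run(
        ["python3", "llvm/utils/update_llc_test_checks.py",
         "--llc-binary", "build/bin/llc", test],
        check=True,
    )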
diff --git a/llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-unconstrained.ll b/llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-unconstrained.ll
new file mode 100644
index 0000000000000..d5f0e789ec8c3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/v8.2a-neon-intrinsics-unconstrained.ll
@@ -0,0 +1,265 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+target triple = "arm64-none-linux-gnu"
+
+define <4 x half> @test_vsqrt_f16(<4 x half> %a) #0 {
+; CHECK-LABEL: test_vsqrt_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.4h, v0.4h
+; CHECK-NEXT: ret
+entry:
+  %vsqrt.i = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %a)
+  ret <4 x half> %vsqrt.i
+}
+
+define <8 x half> @test_vsqrtq_f16(<8 x half> %a) #0 {
+; CHECK-LABEL: test_vsqrtq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fsqrt v0.8h, v0.8h
+; CHECK-NEXT: ret
+entry:
+  %vsqrt.i = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a)
+  ret <8 x half> %vsqrt.i
+}
+
+define <4 x half> @test_vfma_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfma_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.4h, v2.4h, v1.4h
+; CHECK-NEXT: ret
+entry:
+  %0 = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a)
+  ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmaq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmaq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: ret
+entry:
+  %0 = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a)
+  ret <8 x half> %0
+}
+
+define <4 x half> @test_vfms_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfms_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v0.4h, v2.4h, v1.4h
+; CHECK-NEXT: ret
+entry:
+  %fneg.i = fneg <4 x half> %b
+  %0 = call <4 x half> @llvm.fma.v4f16(<4 x half> %fneg.i, <4 x half> %c, <4 x half> %a)
+  ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmsq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v0.8h, v2.8h, v1.8h
+; CHECK-NEXT: ret
+entry:
+  %fneg.i = fneg <8 x half> %b
+  %0 = call <8 x half> @llvm.fma.v8f16(<8 x half> %fneg.i, <8 x half> %c, <8 x half> %a)
+  ret <8 x half> %0
+}
+
+define <4 x half> @test_vfma_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfma_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+  %lane = shufflevector <4 x half> %c, <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %fmla2 = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %lane, <4 x half> %a)
+  ret <4 x half> %fmla2
+}
+
+define <8 x half> @test_vfmaq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfmaq_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+  %lane = shufflevector <4 x half> %c, <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %fmla2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %lane, <8 x half> %a)
+  ret <8 x half> %fmla2
+}
+
+define <4 x half> @test_vfma_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfma_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+  %lane = shufflevector <8 x half> %c, <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %0 = call <4 x half> @llvm.fma.v4f16(<4 x half> %lane, <4 x half> %b, <4 x half> %a)
+  ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmaq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmaq_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+  %lane = shufflevector <8 x half> %c, <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %0 = call <8 x half> @llvm.fma.v8f16(<8 x half> %lane, <8 x half> %b, <8 x half> %a)
+  ret <8 x half> %0
+}
+
+define <4 x half> @test_vfma_n_f16(<4 x half> %a, <4 x half> %b, half %c) #0 {
+; CHECK-LABEL: test_vfma_n_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+  %vecinit = insertelement <4 x half> poison, half %c, i64 0
+  %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> poison, <4 x i32> zeroinitializer
+  %0 = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %vecinit3, <4 x half> %a)
+  ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, half %c) #0 {
+; CHECK-LABEL: test_vfmaq_n_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+  %vecinit = insertelement <8 x half> poison, half %c, i64 0
+  %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> poison, <8 x i32> zeroinitializer
+  %0 = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %vecinit7, <8 x half> %a)
+  ret <8 x half> %0
+}
+
+define half @test_vfmah_lane_f16(half %a, half %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfmah_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmla h0, h1, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+  %extract = extractelement <4 x half> %c, i64 3
+  %0 = call half @llvm.fma.f16(half %b, half %extract, half %a)
+  ret half %0
+}
+
+define half @test_vfmah_laneq_f16(half %a, half %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmah_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla h0, h1, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+  %extract = extractelement <8 x half> %c, i64 7
+  %0 = call half @llvm.fma.f16(half %b, half %extract, half %a)
+  ret half %0
+}
+
+define <4 x half> @test_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfms_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+  %fneg = fneg <4 x half> %b
+  %lane = shufflevector <4 x half> %c, <4 x half> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %fmla2 = call <4 x half> @llvm.fma.v4f16(<4 x half> %fneg, <4 x half> %lane, <4 x half> %a)
+  ret <4 x half> %fmla2
+}
+
+define <8 x half> @test_vfmsq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsq_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+  %fneg = fneg <8 x half> %b
+  %lane = shufflevector <4 x half> %c, <4 x half> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %fmla2 = call <8 x half> @llvm.fma.v8f16(<8 x half> %fneg, <8 x half> %lane, <8 x half> %a)
+  ret <8 x half> %fmla2
+}
+
+define <4 x half> @test_vfms_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfms_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+  %fneg = fneg <4 x half> %b
+  %lane = shufflevector <8 x half> %c, <8 x half> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+  %0 = call <4 x half> @llvm.fma.v4f16(<4 x half> %lane, <4 x half> %fneg, <4 x half> %a)
+  ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmsq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsq_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+  %fneg = fneg <8 x half> %b
+  %lane = shufflevector <8 x half> %c, <8 x half> poison, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  %0 = call <8 x half> @llvm.fma.v8f16(<8 x half> %lane, <8 x half> %fneg, <8 x half> %a)
+  ret <8 x half> %0
+}
+
+define <4 x half> @test_vfms_n_f16(<4 x half> %a, <4 x half> %b, half %c) #0 {
+; CHECK-LABEL: test_vfms_n_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+  %fneg = fneg <4 x half> %b
+  %vecinit = insertelement <4 x half> poison, half %c, i64 0
+  %vecinit3 = shufflevector <4 x half> %vecinit, <4 x half> poison, <4 x i32> zeroinitializer
+  %0 = call <4 x half> @llvm.fma.v4f16(<4 x half> %fneg, <4 x half> %vecinit3, <4 x half> %a)
+  ret <4 x half> %0
+}
+
+define <8 x half> @test_vfmsq_n_f16(<8 x half> %a, <8 x half> %b, half %c) #0 {
+; CHECK-LABEL: test_vfmsq_n_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
+; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
+; CHECK-NEXT: ret
+entry:
+  %fneg = fneg <8 x half> %b
+  %vecinit = insertelement <8 x half> poison, half %c, i64 0
+  %vecinit7 = shufflevector <8 x half> %vecinit, <8 x half> poison, <8 x i32> zeroinitializer
+  %0 = call <8 x half> @llvm.fma.v8f16(<8 x half> %fneg, <8 x half> %vecinit7, <8 x half> %a)
+  ret <8 x half> %0
+}
+
+define half @test_vfmsh_lane_f16(half %a, half %b, <4 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsh_lane_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT: fmls h0, h1, v2.h[3]
+; CHECK-NEXT: ret
+entry:
+  %0 = fneg half %b
+  %extract = extractelement <4 x half> %c, i64 3
+  %1 = call half @llvm.fma.f16(half %0, half %extract, half %a)
+  ret half %1
+}
+
+define half @test_vfmsh_laneq_f16(half %a, half %b, <8 x half> %c) #0 {
+; CHECK-LABEL: test_vfmsh_laneq_f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmls h0, h1, v2.h[7]
+; CHECK-NEXT: ret
+entry:
+  %0 = fneg half %b
+  %extract = extractelement <8 x half> %c, i64 7
+  %1 = call half @llvm.fma.f16(half %0, half %extract, half %a)
+  ret half %1
+}
+
+attributes #0 = { noinline nounwind "no-trapping-math"="true" "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+v8.1a,+v8.2a,+v8a" }
diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index b108a21dbc52b..329e8ecfbab0c 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -557,6 +557,11 @@ def invoke_tool(exe, cmd_args, ir, preprocess_cmd=None, verbose=False):
 UTC_AVOID = "NOTE: Do not autogenerate"
 UNUSED_NOTE = "NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:"
 
+DATA_LAYOUT_RE = re.compile(
+    r"target\sdatalayout\s=\s\"(?P<layout>.+)\"$",
+    flags=(re.M | re.S)
+)
+
 OPT_FUNCTION_RE = re.compile(
     r"^(\s*;\s*Function\sAttrs:\s(?P<attrs>[\w\s():,]+?))?\s*define\s+(?P<funcdef_attrs_and_ret>[^@]*)@(?P<func>[\w.$-]+?)\s*"
     r"(?P<args_and_sig>\((\)|(.*?[\w.-]+?)\))[^{]*\{)\n(?P<body>.*?)^\}$",
     flags=(re.M | re.S),
 )
@@ -650,6 +655,16 @@ def get_triple_from_march(march):
     print("Cannot find a triple. Assume 'x86'", file=sys.stderr)
     return "x86"
 
+def get_global_underscores(raw_tool_output):
+    m = DATA_LAYOUT_RE.search(raw_tool_output)
+    if not m:
+        return False
+    data_layout = m.group("layout")
+    idx = data_layout.find("m:")
+    if idx < 0:
+        return False
+    ch = data_layout[idx + 2]
+    return ch == 'o' or ch == 'x'
 
 def apply_filters(line, filters):
     has_filter = False
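The helper added above keys off the data-layout string that clang prints at the top of its IR output: the "m:" component of an LLVM data layout names the symbol-mangling scheme, and Mach-O ('o') and Windows x86 COFF ('x') are the two schemes that prepend an underscore to global names. A self-contained sketch of the same check, with the regex and logic from the hunk inlined so the example runs standalone (the sample layout strings are illustrative):

import re

# Inlined copy of the hunk above, for a runnable standalone example.
DATA_LAYOUT_RE = re.compile(
    r"target\sdatalayout\s=\s\"(?P<layout>.+)\"$", flags=(re.M | re.S)
)

def get_global_underscores(raw_tool_output):
    m = DATA_LAYOUT_RE.search(raw_tool_output)
    if not m:
        return False
    data_layout = m.group("layout")
    idx = data_layout.find("m:")
    if idx < 0:
        return False
    # Character after "m:" selects the mangling: 'e' ELF, 'o' Mach-O,
    # 'x' Windows x86 COFF, 'w' Windows COFF, ...
    ch = data_layout[idx + 2]
    return ch == 'o' or ch == 'x'

# ELF mangling: globals keep their IR names unchanged.
assert not get_global_underscores('target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"')
# Mach-O mangling: the assembly-level name gains a leading underscore.
assert get_global_underscores('target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"')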
Assume 'x86'", file=sys.stderr) return "x86" +def get_global_underscores(raw_tool_output): + m = DATA_LAYOUT_RE.search(raw_tool_output) + if not m: + return False + data_layout = m.group("layout") + idx = data_layout.find("m:") + if idx < 0: + return False + ch = data_layout[idx + 2] + return ch == 'o' or ch == 'x' def apply_filters(line, filters): has_filter = False diff --git a/llvm/utils/update_cc_test_checks.py b/llvm/utils/update_cc_test_checks.py index 3ffb07ddf6ad8..062971687ab23 100755 --- a/llvm/utils/update_cc_test_checks.py +++ b/llvm/utils/update_cc_test_checks.py @@ -34,7 +34,7 @@ } -def get_line2func_list(args, clang_args): +def get_line2func_list(args, clang_args, global_underscores): ret = collections.defaultdict(list) # Use clang's JSON AST dump to get the mangled name json_dump_args = [args.clang] + clang_args + ["-fsyntax-only", "-o", "-"] @@ -122,6 +122,11 @@ def parse_clang_ast_json(node, loc, search): if search is None: search = spell mangled = node.get("mangledName", spell) + # Strip leading underscore from globals, so the name matches the LLVM one + if global_underscores: + storage = node.get("storageClass", None) + if storage != "static" and mangled[0] == '_': + mangled = mangled[1:] ret[int(line) - 1].append((spell, mangled, search)) ast = json.loads(stdout) @@ -249,10 +254,8 @@ def config(): return args, parser -def get_function_body(builder, args, filename, clang_args, extra_commands, prefixes): +def get_function_body(builder, args, filename, clang_args, extra_commands, prefixes, raw_tool_output): # TODO Clean up duplication of asm/common build_function_body_dictionary - # Invoke external tool and extract function bodies. - raw_tool_output = common.invoke_tool(args.clang, clang_args, filename) for extra_command in extra_commands: extra_args = shlex.split(extra_command) with tempfile.NamedTemporaryFile() as f: @@ -383,13 +386,15 @@ def main(): common.debug("Extracted clang cmd: clang {}".format(clang_args)) common.debug("Extracted FileCheck prefixes: {}".format(prefixes)) + # Invoke external tool and extract function bodies. + raw_tool_output = common.invoke_tool(ti.args.clang, clang_args, ti.path) get_function_body( - builder, ti.args, ti.path, clang_args, extra_commands, prefixes + builder, ti.args, ti.path, clang_args, extra_commands, prefixes, raw_tool_output ) # Invoke clang -Xclang -ast-dump=json to get mapping from start lines to # mangled names. Forward all clang args for now. - for k, v in get_line2func_list(ti.args, clang_args).items(): + for k, v in get_line2func_list(ti.args, clang_args, common.get_global_underscores(raw_tool_output)).items(): line2func_list[k].extend(v) func_dict = builder.finish_and_get_func_dict()